blob: dd8d7b223232a8f4d125d569ba5c2b2cac60ea64 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
52 The globals are initialized by the _PyUnicode_Init() API and should
53 not be used before calling that API.
54
55*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
Victor Stinner8faf8212011-12-08 22:14:11 +010062/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
63#define MAX_UNICODE 0x10ffff
64
Victor Stinner910337b2011-10-03 03:20:16 +020065#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020066# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020067#else
68# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
69#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020070
Victor Stinnere90fe6a2011-10-01 16:48:13 +020071#define _PyUnicode_UTF8(op) \
72 (((PyCompactUnicodeObject*)(op))->utf8)
73#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020074 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020075 assert(PyUnicode_IS_READY(op)), \
76 PyUnicode_IS_COMPACT_ASCII(op) ? \
77 ((char*)((PyASCIIObject*)(op) + 1)) : \
78 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020079#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020080 (((PyCompactUnicodeObject*)(op))->utf8_length)
81#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((PyASCIIObject*)(op))->length : \
86 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020087#define _PyUnicode_WSTR(op) \
88 (((PyASCIIObject*)(op))->wstr)
89#define _PyUnicode_WSTR_LENGTH(op) \
90 (((PyCompactUnicodeObject*)(op))->wstr_length)
91#define _PyUnicode_LENGTH(op) \
92 (((PyASCIIObject *)(op))->length)
93#define _PyUnicode_STATE(op) \
94 (((PyASCIIObject *)(op))->state)
95#define _PyUnicode_HASH(op) \
96 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +020097#define _PyUnicode_KIND(op) \
98 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020099 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200100#define _PyUnicode_GET_LENGTH(op) \
101 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200103#define _PyUnicode_DATA_ANY(op) \
104 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
108#define MAX_MAXCHAR(maxchar1, maxchar2) \
109 ((maxchar1) | (maxchar2))
110
Victor Stinner910337b2011-10-03 03:20:16 +0200111#undef PyUnicode_READY
112#define PyUnicode_READY(op) \
113 (assert(_PyUnicode_CHECK(op)), \
114 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200115 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100116 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200117
/* True if the UTF-8 pointer of `op` is the string's own character buffer
   (i.e. no separate UTF-8 allocation).  Asserts that `op` is not a compact
   ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wchar_t pointer of `op` is the string's own character buffer.
   Every use refers to the macro parameter `op`, so the macro does not depend
   on the name of the caller's variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
Victor Stinner829c0ad2011-10-03 01:08:02 +0200126/* true if the Unicode object has an allocated UTF-8 memory block
127 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_HAS_UTF8_MEMORY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (!PyUnicode_IS_COMPACT_ASCII(op) \
131 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
Victor Stinner03490912011-10-03 23:45:12 +0200134/* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */
136#define _PyUnicode_HAS_WSTR_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (_PyUnicode_WSTR(op) && \
139 (!PyUnicode_IS_READY(op) || \
140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer (or pointer expression) designating the
   buffer where the result characters are written to; it is converted to
   "to_type *" as a whole, so an argument such as "buf + offset" keeps the
   offset in units of buf's own element type.

   The bulk of the input is copied four elements per iteration; the
   remaining (at most three) elements are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Walter Dörwald16807132007-05-25 13:52:07 +0000166/* This dictionary holds all interned unicode strings. Note that references
167 to strings in this dictionary are *not* counted in the string's ob_refcnt.
168 When the interned string reaches a refcnt of 0 the string deallocation
169 function will delete the reference from this dictionary.
170
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000173*/
174static PyObject *interned;
175
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000176/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200177static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200184static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code; an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
/* Same for linebreaks: 128-entry table indexed by ASCII code; an entry is 1
   when the character is a line break.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build structural validation of a Unicode object.

   Every invariant is enforced with assert(); the function itself always
   returns 1 so that callers can write
   assert(_PyUnicode_CheckConsistency(op, ...)).

   op:            object to validate; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of
                  PyUnicode_WCHAR_KIND), additionally scan the characters to
                  verify that the narrowest possible kind is used and that
                  the buffer is followed by a null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *chars;

        if (base->state.compact != 1) {
            /* non-compact string: the characters live behind data.any */
            chars = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(chars != NULL);
                if (base->state.ascii) {
                    assert(cu->utf8 == chars);
                    assert(cu->utf8_length == base->length);
                }
                else {
                    assert(cu->utf8 != chars);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(chars == NULL);
                assert(cu->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: the characters follow the struct */
            chars = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cu->utf8 != chars);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(base->wstr == chars);
                assert(cu->wstr_length == base->length);
            }
            else {
                assert(base->wstr != chars);
            }
        }

        /* an absent cache must have a zero length */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(base);
        Py_ssize_t i = 0;

        while (i < base->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character just past the end must be null */
        assert(PyUnicode_READ(kind, data, base->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
Victor Stinner3a50e702011-10-18 21:21:00 +0200499#ifdef HAVE_MBCS
500static OSVERSIONINFOEX winver;
501#endif
502
Thomas Wouters477c8d52006-05-27 19:21:47 +0000503/* --- Bloom Filters ----------------------------------------------------- */
504
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
Antoine Pitrouf068f942010-01-13 14:19:12 +0000511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521#define BLOOM_MASK unsigned long
522
523static BLOOM_MASK bloom_linebreak;
524
/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)).  The mask argument is parenthesised so that a
   compound expression such as "a | b" is tested as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527
Benjamin Peterson29060642009-01-31 22:14:21 +0000528#define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547#define BLOOM_MEMBER(mask, chr, str) \
548 (BLOOM(mask, chr) \
549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier: every byte of the characters located between old_length and the
   current length is set to 0xff.  Nothing is written when the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
653
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657 Py_ssize_t char_size;
658 Py_ssize_t struct_size;
659 Py_ssize_t new_size;
660 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100661 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200662#ifdef Py_DEBUG
663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
Victor Stinner79891572012-05-03 13:43:07 +0200666 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100668 assert(PyUnicode_IS_COMPACT(unicode));
669
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200670 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100671 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200672 struct_size = sizeof(PyASCIIObject);
673 else
674 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678 PyErr_NoMemory();
679 return NULL;
680 }
681 new_size = (struct_size + (length + 1) * char_size);
682
Victor Stinner84def372011-12-11 20:04:56 +0100683 _Py_DEC_REFTOTAL;
684 _Py_ForgetReference(unicode);
685
686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100688 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 PyErr_NoMemory();
690 return NULL;
691 }
Victor Stinner84def372011-12-11 20:04:56 +0100692 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200696 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100698 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200699 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length);
703#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001096 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097
1098 This function assumes that unicode can hold one more code point than wstr
1099 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001100static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
1104 const wchar_t *iter;
1105 Py_UCS4 *ucs4_out;
1106
Victor Stinner910337b2011-10-03 03:20:16 +02001107 assert(unicode != NULL);
1108 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112 for (iter = begin; iter < end; ) {
1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116 && (iter+1) < end
1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 {
Victor Stinner551ac952011-11-29 22:58:13 +01001119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 iter += 2;
1121 }
1122 else {
1123 *ucs4_out++ = *iter;
1124 iter++;
1125 }
1126 }
1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128 _PyUnicode_GET_LENGTH(unicode)));
1129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130}
1131#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or the
   cached one-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only strings with a canonical kind and exactly one character can be
       a cached latin1 character. */
    ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Victor Stinnerc5166102012-02-22 13:55:02 +01001647/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001648
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001649 WARNING: The function doesn't copy the terminating null character and
1650 doesn't check the maximum character (may write a latin1 character in an
1651 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001652static void
1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1654 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001655{
1656 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1657 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001658 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001659
1660 switch (kind) {
1661 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001662 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001663#ifdef Py_DEBUG
1664 if (PyUnicode_IS_ASCII(unicode)) {
1665 Py_UCS4 maxchar = ucs1lib_find_max_char(
1666 (const Py_UCS1*)str,
1667 (const Py_UCS1*)str + len);
1668 assert(maxchar < 128);
1669 }
1670#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001672 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
Victor Stinner184252a2012-06-16 02:57:41 +02001679 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001683 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
Victor Stinner184252a2012-06-16 02:57:41 +02001691 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 }
1696 }
1697}
1698
1699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700static PyObject*
1701get_latin1_char(unsigned char ch)
1702{
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001705 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 if (!unicode)
1707 return NULL;
1708 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001709 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 unicode_latin1[ch] = unicode;
1711 }
1712 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714}
1715
Alexander Belopolsky40018472011-02-26 01:02:56 +00001716PyObject *
1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001719 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 Py_UCS4 maxchar = 0;
1721 Py_ssize_t num_surrogates;
1722
1723 if (u == NULL)
1724 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001726 /* If the Unicode data is known at construction time, we can apply
1727 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 /* Optimization for empty strings */
1730 if (size == 0 && unicode_empty != NULL) {
1731 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001733 }
Tim Petersced69f82003-09-16 20:30:58 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Single character Unicode objects in the Latin-1 range are
1736 shared when using this constructor */
1737 if (size == 1 && *u < 256)
1738 return get_latin1_char((unsigned char)*u);
1739
1740 /* If not empty and not single character, copy the Unicode data
1741 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001742 if (find_maxchar_surrogates(u, u + size,
1743 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 return NULL;
1745
Victor Stinner8faf8212011-12-08 22:14:11 +01001746 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 if (!unicode)
1748 return NULL;
1749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 switch (PyUnicode_KIND(unicode)) {
1751 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001752 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1754 break;
1755 case PyUnicode_2BYTE_KIND:
1756#if Py_UNICODE_SIZE == 2
1757 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1758#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1761#endif
1762 break;
1763 case PyUnicode_4BYTE_KIND:
1764#if SIZEOF_WCHAR_T == 2
1765 /* This is the only case which has to process surrogates, thus
1766 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001767 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#else
1769 assert(num_surrogates == 0);
1770 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1771#endif
1772 break;
1773 default:
1774 assert(0 && "Impossible state");
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 if (size < 0) {
1784 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 return NULL;
1787 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001788 if (u != NULL)
1789 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1790 else
1791 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001792}
1793
Alexander Belopolsky40018472011-02-26 01:02:56 +00001794PyObject *
1795PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796{
1797 size_t size = strlen(u);
1798 if (size > PY_SSIZE_T_MAX) {
1799 PyErr_SetString(PyExc_OverflowError, "input too long");
1800 return NULL;
1801 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001802 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001803}
1804
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001805PyObject *
1806_PyUnicode_FromId(_Py_Identifier *id)
1807{
1808 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001809 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1810 strlen(id->string),
1811 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001812 if (!id->object)
1813 return NULL;
1814 PyUnicode_InternInPlace(&id->object);
1815 assert(!id->next);
1816 id->next = static_strings;
1817 static_strings = id;
1818 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001819 return id->object;
1820}
1821
1822void
1823_PyUnicode_ClearStaticStrings()
1824{
1825 _Py_Identifier *i;
1826 for (i = static_strings; i; i = i->next) {
1827 Py_DECREF(i->object);
1828 i->object = NULL;
1829 i->next = NULL;
1830 }
1831}
1832
Benjamin Peterson0df54292012-03-26 14:50:32 -04001833/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834
Victor Stinnerd3f08822012-05-29 12:57:52 +02001835PyObject*
1836_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001837{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001838 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinnere6abb482012-05-02 01:15:40 +02001870Py_LOCAL_INLINE(Py_UCS4)
1871align_maxchar(Py_UCS4 maxchar)
1872{
1873 if (maxchar <= 127)
1874 return 127;
1875 else if (maxchar <= 255)
1876 return 255;
1877 else if (maxchar <= 65535)
1878 return 65535;
1879 else
1880 return MAX_UNICODE;
1881}
1882
Victor Stinner702c7342011-10-05 13:50:52 +02001883static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001884_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001888
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001889 if (size == 0) {
1890 Py_INCREF(unicode_empty);
1891 return unicode_empty;
1892 }
1893 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001894 if (size == 1)
1895 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001896
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 if (!res)
1900 return NULL;
1901 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001904}
1905
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906static PyObject*
1907_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908{
1909 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001910 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001912 if (size == 0) {
1913 Py_INCREF(unicode_empty);
1914 return unicode_empty;
1915 }
1916 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001917 if (size == 1) {
1918 Py_UCS4 ch = u[0];
1919 if (ch < 256)
1920 return get_latin1_char((unsigned char)ch);
1921
1922 res = PyUnicode_New(1, ch);
1923 if (res == NULL)
1924 return NULL;
1925 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1926 assert(_PyUnicode_CheckConsistency(res, 1));
1927 return res;
1928 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001930 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001931 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 if (!res)
1933 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 else {
1937 _PyUnicode_CONVERT_BYTES(
1938 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1939 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001940 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res;
1942}
1943
Victor Stinnere57b1c02011-09-28 22:20:48 +02001944static PyObject*
1945_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946{
1947 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001948 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 if (size == 0) {
1951 Py_INCREF(unicode_empty);
1952 return unicode_empty;
1953 }
1954 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001955 if (size == 1) {
1956 Py_UCS4 ch = u[0];
1957 if (ch < 256)
1958 return get_latin1_char((unsigned char)ch);
1959
1960 res = PyUnicode_New(1, ch);
1961 if (res == NULL)
1962 return NULL;
1963 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1964 assert(_PyUnicode_CheckConsistency(res, 1));
1965 return res;
1966 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 if (!res)
1971 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001972 if (max_char < 256)
1973 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1974 PyUnicode_1BYTE_DATA(res));
1975 else if (max_char < 0x10000)
1976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1977 PyUnicode_2BYTE_DATA(res));
1978 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
1984PyObject*
1985PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1986{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001987 if (size < 0) {
1988 PyErr_SetString(PyExc_ValueError, "size must be positive");
1989 return NULL;
1990 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001995 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 PyErr_SetString(PyExc_SystemError, "invalid kind");
2000 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinnerece58de2012-04-23 23:36:38 +02002004Py_UCS4
2005_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2006{
2007 enum PyUnicode_Kind kind;
2008 void *startptr, *endptr;
2009
2010 assert(PyUnicode_IS_READY(unicode));
2011 assert(0 <= start);
2012 assert(end <= PyUnicode_GET_LENGTH(unicode));
2013 assert(start <= end);
2014
2015 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2016 return PyUnicode_MAX_CHAR_VALUE(unicode);
2017
2018 if (start == end)
2019 return 127;
2020
Victor Stinner94d558b2012-04-27 22:26:58 +02002021 if (PyUnicode_IS_ASCII(unicode))
2022 return 127;
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002025 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002026 endptr = (char *)startptr + end * kind;
2027 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002028 switch(kind) {
2029 case PyUnicode_1BYTE_KIND:
2030 return ucs1lib_find_max_char(startptr, endptr);
2031 case PyUnicode_2BYTE_KIND:
2032 return ucs2lib_find_max_char(startptr, endptr);
2033 case PyUnicode_4BYTE_KIND:
2034 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002035 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002036 assert(0);
2037 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002038 }
2039}
2040
Victor Stinner25a4b292011-10-06 12:31:55 +02002041/* Ensure that a string uses the most efficient storage, if it is not the
2042 case: create a new string with of the right kind. Write NULL into *p_unicode
2043 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002044static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002045unicode_adjust_maxchar(PyObject **p_unicode)
2046{
2047 PyObject *unicode, *copy;
2048 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002049 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002050 unsigned int kind;
2051
2052 assert(p_unicode != NULL);
2053 unicode = *p_unicode;
2054 assert(PyUnicode_IS_READY(unicode));
2055 if (PyUnicode_IS_ASCII(unicode))
2056 return;
2057
2058 len = PyUnicode_GET_LENGTH(unicode);
2059 kind = PyUnicode_KIND(unicode);
2060 if (kind == PyUnicode_1BYTE_KIND) {
2061 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002062 max_char = ucs1lib_find_max_char(u, u + len);
2063 if (max_char >= 128)
2064 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002065 }
2066 else if (kind == PyUnicode_2BYTE_KIND) {
2067 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002068 max_char = ucs2lib_find_max_char(u, u + len);
2069 if (max_char >= 256)
2070 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002071 }
2072 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002075 max_char = ucs4lib_find_max_char(u, u + len);
2076 if (max_char >= 0x10000)
2077 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002080 if (copy != NULL)
2081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 Py_DECREF(unicode);
2083 *p_unicode = copy;
2084}
2085
Victor Stinner034f6cf2011-09-30 02:26:44 +02002086PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002087_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002088{
Victor Stinner87af4f22011-11-21 23:03:47 +01002089 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002090 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002091
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092 if (!PyUnicode_Check(unicode)) {
2093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002096 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002097 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002098
Victor Stinner87af4f22011-11-21 23:03:47 +01002099 length = PyUnicode_GET_LENGTH(unicode);
2100 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002101 if (!copy)
2102 return NULL;
2103 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2104
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2106 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002107 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002109}
2110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111
Victor Stinnerbc603d12011-10-02 01:00:40 +02002112/* Widen Unicode objects to larger buffers. Don't write terminating null
2113 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114
2115void*
2116_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2117{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002118 Py_ssize_t len;
2119 void *result;
2120 unsigned int skind;
2121
Benjamin Petersonbac79492012-01-14 13:34:47 -05002122 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002123 return NULL;
2124
2125 len = PyUnicode_GET_LENGTH(s);
2126 skind = PyUnicode_KIND(s);
2127 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002128 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 return NULL;
2130 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002131 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132 case PyUnicode_2BYTE_KIND:
2133 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2134 if (!result)
2135 return PyErr_NoMemory();
2136 assert(skind == PyUnicode_1BYTE_KIND);
2137 _PyUnicode_CONVERT_BYTES(
2138 Py_UCS1, Py_UCS2,
2139 PyUnicode_1BYTE_DATA(s),
2140 PyUnicode_1BYTE_DATA(s) + len,
2141 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 case PyUnicode_4BYTE_KIND:
2144 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2145 if (!result)
2146 return PyErr_NoMemory();
2147 if (skind == PyUnicode_2BYTE_KIND) {
2148 _PyUnicode_CONVERT_BYTES(
2149 Py_UCS2, Py_UCS4,
2150 PyUnicode_2BYTE_DATA(s),
2151 PyUnicode_2BYTE_DATA(s) + len,
2152 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002154 else {
2155 assert(skind == PyUnicode_1BYTE_KIND);
2156 _PyUnicode_CONVERT_BYTES(
2157 Py_UCS1, Py_UCS4,
2158 PyUnicode_1BYTE_DATA(s),
2159 PyUnicode_1BYTE_DATA(s) + len,
2160 result);
2161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 default:
2164 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 }
Victor Stinner01698042011-10-04 00:04:26 +02002166 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 return NULL;
2168}
2169
2170static Py_UCS4*
2171as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2172 int copy_null)
2173{
2174 int kind;
2175 void *data;
2176 Py_ssize_t len, targetlen;
2177 if (PyUnicode_READY(string) == -1)
2178 return NULL;
2179 kind = PyUnicode_KIND(string);
2180 data = PyUnicode_DATA(string);
2181 len = PyUnicode_GET_LENGTH(string);
2182 targetlen = len;
2183 if (copy_null)
2184 targetlen++;
2185 if (!target) {
2186 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2187 PyErr_NoMemory();
2188 return NULL;
2189 }
2190 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2191 if (!target) {
2192 PyErr_NoMemory();
2193 return NULL;
2194 }
2195 }
2196 else {
2197 if (targetsize < targetlen) {
2198 PyErr_Format(PyExc_SystemError,
2199 "string is longer than the buffer");
2200 if (copy_null && 0 < targetsize)
2201 target[0] = 0;
2202 return NULL;
2203 }
2204 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002205 if (kind == PyUnicode_1BYTE_KIND) {
2206 Py_UCS1 *start = (Py_UCS1 *) data;
2207 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 else if (kind == PyUnicode_2BYTE_KIND) {
2210 Py_UCS2 *start = (Py_UCS2 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2212 }
2213 else {
2214 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (copy_null)
2218 target[len] = 0;
2219 return target;
2220}
2221
2222Py_UCS4*
2223PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2224 int copy_null)
2225{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002226 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 PyErr_BadInternalCall();
2228 return NULL;
2229 }
2230 return as_ucs4(string, target, targetsize, copy_null);
2231}
2232
2233Py_UCS4*
2234PyUnicode_AsUCS4Copy(PyObject *string)
2235{
2236 return as_ucs4(string, NULL, 0, 1);
2237}
2238
2239#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002240
Alexander Belopolsky40018472011-02-26 01:02:56 +00002241PyObject *
2242PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002245 if (size == 0) {
2246 Py_INCREF(unicode_empty);
2247 return unicode_empty;
2248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 PyErr_BadInternalCall();
2250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 }
2252
Martin v. Löwis790465f2008-04-05 20:41:37 +00002253 if (size == -1) {
2254 size = wcslen(w);
2255 }
2256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258}
2259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002261
Walter Dörwald346737f2007-05-31 10:44:43 +00002262static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002263makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002264 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002265{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (longflag)
2268 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002269 else if (longlongflag) {
2270 /* longlongflag should only ever be nonzero on machines with
2271 HAVE_LONG_LONG defined */
2272#ifdef HAVE_LONG_LONG
2273 char *f = PY_FORMAT_LONG_LONG;
2274 while (*f)
2275 *fmt++ = *f++;
2276#else
2277 /* we shouldn't ever get here */
2278 assert(0);
2279 *fmt++ = 'l';
2280#endif
2281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 else if (size_tflag) {
2283 char *f = PY_FORMAT_SIZE_T;
2284 while (*f)
2285 *fmt++ = *f++;
2286 }
2287 *fmt++ = c;
2288 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002289}
2290
Victor Stinner15a11362012-10-06 23:48:20 +02002291/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002292 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2293 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2294#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002295
2296static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002297unicode_fromformat_arg(_PyUnicodeWriter *writer,
2298 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002299{
Victor Stinnere215d962012-10-06 23:03:36 +02002300 const char *p;
2301 Py_ssize_t len;
2302 int zeropad;
2303 int width;
2304 int precision;
2305 int longflag;
2306 int longlongflag;
2307 int size_tflag;
2308 int fill;
2309
2310 p = f;
2311 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002312 zeropad = 0;
2313 if (*f == '0') {
2314 zeropad = 1;
2315 f++;
2316 }
Victor Stinner96865452011-03-01 23:44:09 +00002317
2318 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002319 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002320 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002321 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2322 PyErr_SetString(PyExc_ValueError,
2323 "width too big");
2324 return NULL;
2325 }
Victor Stinnere215d962012-10-06 23:03:36 +02002326 width = (width*10) + (*f - '0');
2327 f++;
2328 }
Victor Stinner96865452011-03-01 23:44:09 +00002329 precision = 0;
2330 if (*f == '.') {
2331 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002332 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002333 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2334 PyErr_SetString(PyExc_ValueError,
2335 "precision too big");
2336 return NULL;
2337 }
Victor Stinnere215d962012-10-06 23:03:36 +02002338 precision = (precision*10) + (*f - '0');
2339 f++;
2340 }
Victor Stinner96865452011-03-01 23:44:09 +00002341 if (*f == '%') {
2342 /* "%.3%s" => f points to "3" */
2343 f--;
2344 }
2345 }
2346 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002347 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002348 f--;
2349 }
Victor Stinner96865452011-03-01 23:44:09 +00002350
2351 /* Handle %ld, %lu, %lld and %llu. */
2352 longflag = 0;
2353 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002354 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002355 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002356 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002357 longflag = 1;
2358 ++f;
2359 }
2360#ifdef HAVE_LONG_LONG
2361 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002362 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002363 longlongflag = 1;
2364 f += 2;
2365 }
2366#endif
2367 }
2368 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002369 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002370 size_tflag = 1;
2371 ++f;
2372 }
Victor Stinnere215d962012-10-06 23:03:36 +02002373
2374 if (f[1] == '\0')
2375 writer->overallocate = 0;
2376
2377 switch (*f) {
2378 case 'c':
2379 {
2380 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002381 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2382 PyErr_SetString(PyExc_ValueError,
2383 "character argument not in range(0x110000)");
2384 return NULL;
2385 }
Victor Stinnere215d962012-10-06 23:03:36 +02002386 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2387 return NULL;
2388 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2389 writer->pos++;
2390 break;
2391 }
2392
2393 case 'i':
2394 case 'd':
2395 case 'u':
2396 case 'x':
2397 {
2398 /* used by sprintf */
2399 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002400 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002401
2402 if (*f == 'u') {
2403 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2404
2405 if (longflag)
2406 len = sprintf(buffer, fmt,
2407 va_arg(*vargs, unsigned long));
2408#ifdef HAVE_LONG_LONG
2409 else if (longlongflag)
2410 len = sprintf(buffer, fmt,
2411 va_arg(*vargs, unsigned PY_LONG_LONG));
2412#endif
2413 else if (size_tflag)
2414 len = sprintf(buffer, fmt,
2415 va_arg(*vargs, size_t));
2416 else
2417 len = sprintf(buffer, fmt,
2418 va_arg(*vargs, unsigned int));
2419 }
2420 else if (*f == 'x') {
2421 makefmt(fmt, 0, 0, 0, 'x');
2422 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2423 }
2424 else {
2425 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2426
2427 if (longflag)
2428 len = sprintf(buffer, fmt,
2429 va_arg(*vargs, long));
2430#ifdef HAVE_LONG_LONG
2431 else if (longlongflag)
2432 len = sprintf(buffer, fmt,
2433 va_arg(*vargs, PY_LONG_LONG));
2434#endif
2435 else if (size_tflag)
2436 len = sprintf(buffer, fmt,
2437 va_arg(*vargs, Py_ssize_t));
2438 else
2439 len = sprintf(buffer, fmt,
2440 va_arg(*vargs, int));
2441 }
2442 assert(len >= 0);
2443
Victor Stinnere215d962012-10-06 23:03:36 +02002444 if (precision < len)
2445 precision = len;
2446 if (width > precision) {
2447 Py_UCS4 fillchar;
2448 fill = width - precision;
2449 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002450 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2451 return NULL;
2452 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2453 return NULL;
2454 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002455 }
Victor Stinner15a11362012-10-06 23:48:20 +02002456 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002457 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002458 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2459 return NULL;
2460 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2461 return NULL;
2462 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002463 }
Victor Stinner15a11362012-10-06 23:48:20 +02002464 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002465 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002466 break;
2467 }
2468
2469 case 'p':
2470 {
2471 char number[MAX_LONG_LONG_CHARS];
2472
2473 len = sprintf(number, "%p", va_arg(*vargs, void*));
2474 assert(len >= 0);
2475
2476 /* %p is ill-defined: ensure leading 0x. */
2477 if (number[1] == 'X')
2478 number[1] = 'x';
2479 else if (number[1] != 'x') {
2480 memmove(number + 2, number,
2481 strlen(number) + 1);
2482 number[0] = '0';
2483 number[1] = 'x';
2484 len += 2;
2485 }
2486
2487 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2488 return NULL;
2489 break;
2490 }
2491
2492 case 's':
2493 {
2494 /* UTF-8 */
2495 const char *s = va_arg(*vargs, const char*);
2496 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2497 if (!str)
2498 return NULL;
2499 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2500 Py_DECREF(str);
2501 return NULL;
2502 }
2503 Py_DECREF(str);
2504 break;
2505 }
2506
2507 case 'U':
2508 {
2509 PyObject *obj = va_arg(*vargs, PyObject *);
2510 assert(obj && _PyUnicode_CHECK(obj));
2511
2512 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2513 return NULL;
2514 break;
2515 }
2516
2517 case 'V':
2518 {
2519 PyObject *obj = va_arg(*vargs, PyObject *);
2520 const char *str = va_arg(*vargs, const char *);
2521 PyObject *str_obj;
2522 assert(obj || str);
2523 if (obj) {
2524 assert(_PyUnicode_CHECK(obj));
2525 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2526 return NULL;
2527 }
2528 else {
2529 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2530 if (!str_obj)
2531 return NULL;
2532 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2533 Py_DECREF(str_obj);
2534 return NULL;
2535 }
2536 Py_DECREF(str_obj);
2537 }
2538 break;
2539 }
2540
2541 case 'S':
2542 {
2543 PyObject *obj = va_arg(*vargs, PyObject *);
2544 PyObject *str;
2545 assert(obj);
2546 str = PyObject_Str(obj);
2547 if (!str)
2548 return NULL;
2549 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2550 Py_DECREF(str);
2551 return NULL;
2552 }
2553 Py_DECREF(str);
2554 break;
2555 }
2556
2557 case 'R':
2558 {
2559 PyObject *obj = va_arg(*vargs, PyObject *);
2560 PyObject *repr;
2561 assert(obj);
2562 repr = PyObject_Repr(obj);
2563 if (!repr)
2564 return NULL;
2565 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2566 Py_DECREF(repr);
2567 return NULL;
2568 }
2569 Py_DECREF(repr);
2570 break;
2571 }
2572
2573 case 'A':
2574 {
2575 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *ascii;
2577 assert(obj);
2578 ascii = PyObject_ASCII(obj);
2579 if (!ascii)
2580 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2582 Py_DECREF(ascii);
2583 return NULL;
2584 }
2585 Py_DECREF(ascii);
2586 break;
2587 }
2588
2589 case '%':
2590 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2591 return NULL;
2592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2593 writer->pos++;
2594 break;
2595
2596 default:
2597 /* if we stumble upon an unknown formatting code, copy the rest
2598 of the format string to the output string. (we cannot just
2599 skip the code, since there's no way to know what's in the
2600 argument list) */
2601 len = strlen(p);
2602 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2603 return NULL;
2604 f = p+len;
2605 return f;
2606 }
2607
2608 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002609 return f;
2610}
2611
Walter Dörwaldd2034312007-05-18 16:29:38 +00002612PyObject *
2613PyUnicode_FromFormatV(const char *format, va_list vargs)
2614{
Victor Stinnere215d962012-10-06 23:03:36 +02002615 va_list vargs2;
2616 const char *f;
2617 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Victor Stinnere215d962012-10-06 23:03:36 +02002619 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2620
2621 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2622 Copy it to be able to pass a reference to a subfunction. */
2623 Py_VA_COPY(vargs2, vargs);
2624
2625 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 f = unicode_fromformat_arg(&writer, f, &vargs2);
2628 if (f == NULL)
2629 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002632 const char *p;
2633 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002634
Victor Stinnere215d962012-10-06 23:03:36 +02002635 p = f;
2636 do
2637 {
2638 if ((unsigned char)*p > 127) {
2639 PyErr_Format(PyExc_ValueError,
2640 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2641 "string, got a non-ASCII byte: 0x%02x",
2642 (unsigned char)*p);
2643 return NULL;
2644 }
2645 p++;
2646 }
2647 while (*p != '\0' && *p != '%');
2648 len = p - f;
2649
2650 if (*p == '\0')
2651 writer.overallocate = 0;
2652 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2653 goto fail;
2654 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2655 writer.pos += len;
2656
2657 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 }
Victor Stinnere215d962012-10-06 23:03:36 +02002660 return _PyUnicodeWriter_Finish(&writer);
2661
2662 fail:
2663 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002665}
2666
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667PyObject *
2668PyUnicode_FromFormat(const char *format, ...)
2669{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 PyObject* ret;
2671 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672
2673#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002677#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 ret = PyUnicode_FromFormatV(format, vargs);
2679 va_end(vargs);
2680 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681}
2682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683#ifdef HAVE_WCHAR_H
2684
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2686 convert a Unicode object to a wide character string.
2687
Victor Stinnerd88d9832011-09-06 02:00:05 +02002688 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002689 character) required to convert the unicode object. Ignore size argument.
2690
Victor Stinnerd88d9832011-09-06 02:00:05 +02002691 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002692 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002693 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002694static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002695unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002696 wchar_t *w,
2697 Py_ssize_t size)
2698{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 const wchar_t *wstr;
2701
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002702 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 if (wstr == NULL)
2704 return -1;
2705
Victor Stinner5593d8a2010-10-02 11:11:27 +00002706 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 if (size > res)
2708 size = res + 1;
2709 else
2710 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712 return res;
2713 }
2714 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002716}
2717
2718Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002719PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722{
2723 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 PyErr_BadInternalCall();
2725 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
Victor Stinner137c34c2010-09-29 10:25:54 +00002730wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002731PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 Py_ssize_t *size)
2733{
2734 wchar_t* buffer;
2735 Py_ssize_t buflen;
2736
2737 if (unicode == NULL) {
2738 PyErr_BadInternalCall();
2739 return NULL;
2740 }
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (buflen == -1)
2744 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002746 PyErr_NoMemory();
2747 return NULL;
2748 }
2749
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2751 if (buffer == NULL) {
2752 PyErr_NoMemory();
2753 return NULL;
2754 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002755 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002756 if (buflen == -1) {
2757 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002759 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002760 if (size != NULL)
2761 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 return buffer;
2763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766
Alexander Belopolsky40018472011-02-26 01:02:56 +00002767PyObject *
2768PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002771 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 PyErr_SetString(PyExc_ValueError,
2773 "chr() arg not in range(0x110000)");
2774 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002775 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002777 if (ordinal < 256)
2778 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 v = PyUnicode_New(1, ordinal);
2781 if (v == NULL)
2782 return NULL;
2783 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002784 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002786}
2787
Alexander Belopolsky40018472011-02-26 01:02:56 +00002788PyObject *
2789PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002794 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002795 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 Py_INCREF(obj);
2797 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798 }
2799 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 /* For a Unicode subtype that's not a Unicode object,
2801 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002802 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002804 PyErr_Format(PyExc_TypeError,
2805 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002806 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002807 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002812 const char *encoding,
2813 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002814{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002815 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002816 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 PyErr_BadInternalCall();
2820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 /* Decoding bytes objects is the most common case and should be fast */
2824 if (PyBytes_Check(obj)) {
2825 if (PyBytes_GET_SIZE(obj) == 0) {
2826 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002827 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 }
2829 else {
2830 v = PyUnicode_Decode(
2831 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2832 encoding, errors);
2833 }
2834 return v;
2835 }
2836
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 PyErr_SetString(PyExc_TypeError,
2839 "decoding str is not supported");
2840 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002841 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002842
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002843 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2844 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2845 PyErr_Format(PyExc_TypeError,
2846 "coercing to str: need bytes, bytearray "
2847 "or buffer-like object, %.80s found",
2848 Py_TYPE(obj)->tp_name);
2849 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002850 }
Tim Petersced69f82003-09-16 20:30:58 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002854 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Tim Petersced69f82003-09-16 20:30:58 +00002856 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861}
2862
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and '_' is replaced with '-', so that e.g.
   "UTF_8" becomes "utf-8".  A NULL `encoding` is normalized to "utf-8".

   lower_len is the size of the `lower` buffer, terminating null included.

   Return 1 on success, 0 if the encoding is longer than lower_len-1
   characters (in which case `lower` is left without a terminating
   null). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *last;   /* slot reserved for the terminating null */

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    last = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == last)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2899
Alexander Belopolsky40018472011-02-26 01:02:56 +00002900PyObject *
2901PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002902 Py_ssize_t size,
2903 const char *encoding,
2904 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002905{
2906 PyObject *buffer = NULL, *unicode;
2907 Py_buffer info;
2908 char lower[11]; /* Enough for any encoding shortcut */
2909
Fred Drakee4315f52000-05-09 19:53:39 +00002910 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002911 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002912 if ((strcmp(lower, "utf-8") == 0) ||
2913 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002914 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002915 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002916 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002917 (strcmp(lower, "iso-8859-1") == 0))
2918 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002919#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002920 else if (strcmp(lower, "mbcs") == 0)
2921 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002922#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002923 else if (strcmp(lower, "ascii") == 0)
2924 return PyUnicode_DecodeASCII(s, size, errors);
2925 else if (strcmp(lower, "utf-16") == 0)
2926 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2927 else if (strcmp(lower, "utf-32") == 0)
2928 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002932 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002933 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002934 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002935 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 if (buffer == NULL)
2937 goto onError;
2938 unicode = PyCodec_Decode(buffer, encoding, errors);
2939 if (unicode == NULL)
2940 goto onError;
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002943 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002944 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 Py_DECREF(unicode);
2946 goto onError;
2947 }
2948 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002949 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(buffer);
2953 return NULL;
2954}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 const char *encoding,
2959 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002960{
2961 PyObject *v;
2962
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_BadArgument();
2965 goto onError;
2966 }
2967
2968 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002970
2971 /* Decode via the codec registry */
2972 v = PyCodec_Decode(unicode, encoding, errors);
2973 if (v == NULL)
2974 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002975 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002978 return NULL;
2979}
2980
Alexander Belopolsky40018472011-02-26 01:02:56 +00002981PyObject *
2982PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002983 const char *encoding,
2984 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002985{
2986 PyObject *v;
2987
2988 if (!PyUnicode_Check(unicode)) {
2989 PyErr_BadArgument();
2990 goto onError;
2991 }
2992
2993 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002995
2996 /* Decode via the codec registry */
2997 v = PyCodec_Decode(unicode, encoding, errors);
2998 if (v == NULL)
2999 goto onError;
3000 if (!PyUnicode_Check(v)) {
3001 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003002 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003003 Py_TYPE(v)->tp_name);
3004 Py_DECREF(v);
3005 goto onError;
3006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003007 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003008
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003010 return NULL;
3011}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 Py_ssize_t size,
3016 const char *encoding,
3017 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018{
3019 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003020
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 unicode = PyUnicode_FromUnicode(s, size);
3022 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3025 Py_DECREF(unicode);
3026 return v;
3027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003031 const char *encoding,
3032 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003033{
3034 PyObject *v;
3035
3036 if (!PyUnicode_Check(unicode)) {
3037 PyErr_BadArgument();
3038 goto onError;
3039 }
3040
3041 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003043
3044 /* Encode via the codec registry */
3045 v = PyCodec_Encode(unicode, encoding, errors);
3046 if (v == NULL)
3047 goto onError;
3048 return v;
3049
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003051 return NULL;
3052}
3053
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Returns the index, in wchar_t units, of the offending character.  Returns
   0 when no character fails, so a result of 0 is ambiguous. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminator */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3100
Victor Stinner1b579672011-12-17 05:47:23 +01003101static int
3102locale_error_handler(const char *errors, int *surrogateescape)
3103{
3104 if (errors == NULL) {
3105 *surrogateescape = 0;
3106 return 0;
3107 }
3108
3109 if (strcmp(errors, "strict") == 0) {
3110 *surrogateescape = 0;
3111 return 0;
3112 }
3113 if (strcmp(errors, "surrogateescape") == 0) {
3114 *surrogateescape = 1;
3115 return 0;
3116 }
3117 PyErr_Format(PyExc_ValueError,
3118 "only 'strict' and 'surrogateescape' error handlers "
3119 "are supported, not '%s'",
3120 errors);
3121 return -1;
3122}
3123
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003124PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003125PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003126{
3127 Py_ssize_t wlen, wlen2;
3128 wchar_t *wstr;
3129 PyObject *bytes = NULL;
3130 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003131 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003132 PyObject *exc;
3133 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003134 int surrogateescape;
3135
3136 if (locale_error_handler(errors, &surrogateescape) < 0)
3137 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003138
3139 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3140 if (wstr == NULL)
3141 return NULL;
3142
3143 wlen2 = wcslen(wstr);
3144 if (wlen2 != wlen) {
3145 PyMem_Free(wstr);
3146 PyErr_SetString(PyExc_TypeError, "embedded null character");
3147 return NULL;
3148 }
3149
3150 if (surrogateescape) {
3151 /* locale encoding with surrogateescape */
3152 char *str;
3153
3154 str = _Py_wchar2char(wstr, &error_pos);
3155 if (str == NULL) {
3156 if (error_pos == (size_t)-1) {
3157 PyErr_NoMemory();
3158 PyMem_Free(wstr);
3159 return NULL;
3160 }
3161 else {
3162 goto encode_error;
3163 }
3164 }
3165 PyMem_Free(wstr);
3166
3167 bytes = PyBytes_FromString(str);
3168 PyMem_Free(str);
3169 }
3170 else {
3171 size_t len, len2;
3172
3173 len = wcstombs(NULL, wstr, 0);
3174 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003175 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003176 goto encode_error;
3177 }
3178
3179 bytes = PyBytes_FromStringAndSize(NULL, len);
3180 if (bytes == NULL) {
3181 PyMem_Free(wstr);
3182 return NULL;
3183 }
3184
3185 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3186 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003187 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003188 goto encode_error;
3189 }
3190 PyMem_Free(wstr);
3191 }
3192 return bytes;
3193
3194encode_error:
3195 errmsg = strerror(errno);
3196 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003197
3198 if (error_pos == (size_t)-1)
3199 error_pos = wcstombs_errorpos(wstr);
3200
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003201 PyMem_Free(wstr);
3202 Py_XDECREF(bytes);
3203
Victor Stinner2f197072011-12-17 07:08:30 +01003204 if (errmsg != NULL) {
3205 size_t errlen;
3206 wstr = _Py_char2wchar(errmsg, &errlen);
3207 if (wstr != NULL) {
3208 reason = PyUnicode_FromWideChar(wstr, errlen);
3209 PyMem_Free(wstr);
3210 } else
3211 errmsg = NULL;
3212 }
3213 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003214 reason = PyUnicode_FromString(
3215 "wcstombs() encountered an unencodable "
3216 "wide character");
3217 if (reason == NULL)
3218 return NULL;
3219
3220 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3221 "locale", unicode,
3222 (Py_ssize_t)error_pos,
3223 (Py_ssize_t)(error_pos+1),
3224 reason);
3225 Py_DECREF(reason);
3226 if (exc != NULL) {
3227 PyCodec_StrictErrors(exc);
3228 Py_XDECREF(exc);
3229 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003230 return NULL;
3231}
3232
Victor Stinnerad158722010-10-27 00:25:46 +00003233PyObject *
3234PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003235{
Victor Stinner99b95382011-07-04 14:23:54 +02003236#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003237 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003238#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003239 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003240#else
Victor Stinner793b5312011-04-27 00:24:21 +02003241 PyInterpreterState *interp = PyThreadState_GET()->interp;
3242 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3243 cannot use it to encode and decode filenames before it is loaded. Load
3244 the Python codec requires to encode at least its own filename. Use the C
3245 version of the locale codec until the codec registry is initialized and
3246 the Python codec is loaded.
3247
3248 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3249 cannot only rely on it: check also interp->fscodec_initialized for
3250 subinterpreters. */
3251 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003252 return PyUnicode_AsEncodedString(unicode,
3253 Py_FileSystemDefaultEncoding,
3254 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003255 }
3256 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003257 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003258 }
Victor Stinnerad158722010-10-27 00:25:46 +00003259#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003260}
3261
Alexander Belopolsky40018472011-02-26 01:02:56 +00003262PyObject *
3263PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003264 const char *encoding,
3265 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266{
3267 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003268 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003269
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 if (!PyUnicode_Check(unicode)) {
3271 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 }
Fred Drakee4315f52000-05-09 19:53:39 +00003274
Fred Drakee4315f52000-05-09 19:53:39 +00003275 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003276 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003277 if ((strcmp(lower, "utf-8") == 0) ||
3278 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003279 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003280 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003281 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003282 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003284 }
Victor Stinner37296e82010-06-10 13:36:23 +00003285 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003286 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003287 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003289#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003290 else if (strcmp(lower, "mbcs") == 0)
3291 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003292#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003293 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296
3297 /* Encode via the codec registry */
3298 v = PyCodec_Encode(unicode, encoding, errors);
3299 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003300 return NULL;
3301
3302 /* The normal path */
3303 if (PyBytes_Check(v))
3304 return v;
3305
3306 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003307 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003308 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003309 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003310
3311 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3312 "encoder %s returned bytearray instead of bytes",
3313 encoding);
3314 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003315 Py_DECREF(v);
3316 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003317 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003318
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003319 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3320 Py_DECREF(v);
3321 return b;
3322 }
3323
3324 PyErr_Format(PyExc_TypeError,
3325 "encoder did not return a bytes object (type=%.400s)",
3326 Py_TYPE(v)->tp_name);
3327 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003328 return NULL;
3329}
3330
Alexander Belopolsky40018472011-02-26 01:02:56 +00003331PyObject *
3332PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003333 const char *encoding,
3334 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003335{
3336 PyObject *v;
3337
3338 if (!PyUnicode_Check(unicode)) {
3339 PyErr_BadArgument();
3340 goto onError;
3341 }
3342
3343 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003345
3346 /* Encode via the codec registry */
3347 v = PyCodec_Encode(unicode, encoding, errors);
3348 if (v == NULL)
3349 goto onError;
3350 if (!PyUnicode_Check(v)) {
3351 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003352 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353 Py_TYPE(v)->tp_name);
3354 Py_DECREF(v);
3355 goto onError;
3356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003358
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 return NULL;
3361}
3362
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   rejects or reports as incomplete.

   Returns the byte offset of that sequence.  Returns 0 when nothing fails
   or when HAVE_MBRTOWC is not defined, so a result of 0 is ambiguous. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; cursor += consumed, remaining -= consumed) {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3393
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003394PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003395PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003396 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003397{
3398 wchar_t smallbuf[256];
3399 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3400 wchar_t *wstr;
3401 size_t wlen, wlen2;
3402 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003403 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003404 size_t error_pos;
3405 char *errmsg;
3406 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003407
3408 if (locale_error_handler(errors, &surrogateescape) < 0)
3409 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003410
3411 if (str[len] != '\0' || len != strlen(str)) {
3412 PyErr_SetString(PyExc_TypeError, "embedded null character");
3413 return NULL;
3414 }
3415
3416 if (surrogateescape)
3417 {
3418 wstr = _Py_char2wchar(str, &wlen);
3419 if (wstr == NULL) {
3420 if (wlen == (size_t)-1)
3421 PyErr_NoMemory();
3422 else
3423 PyErr_SetFromErrno(PyExc_OSError);
3424 return NULL;
3425 }
3426
3427 unicode = PyUnicode_FromWideChar(wstr, wlen);
3428 PyMem_Free(wstr);
3429 }
3430 else {
3431#ifndef HAVE_BROKEN_MBSTOWCS
3432 wlen = mbstowcs(NULL, str, 0);
3433#else
3434 wlen = len;
3435#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003436 if (wlen == (size_t)-1)
3437 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003438 if (wlen+1 <= smallbuf_len) {
3439 wstr = smallbuf;
3440 }
3441 else {
3442 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3443 return PyErr_NoMemory();
3444
3445 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3446 if (!wstr)
3447 return PyErr_NoMemory();
3448 }
3449
3450 /* This shouldn't fail now */
3451 wlen2 = mbstowcs(wstr, str, wlen+1);
3452 if (wlen2 == (size_t)-1) {
3453 if (wstr != smallbuf)
3454 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003455 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003456 }
3457#ifdef HAVE_BROKEN_MBSTOWCS
3458 assert(wlen2 == wlen);
3459#endif
3460 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3461 if (wstr != smallbuf)
3462 PyMem_Free(wstr);
3463 }
3464 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003465
3466decode_error:
3467 errmsg = strerror(errno);
3468 assert(errmsg != NULL);
3469
3470 error_pos = mbstowcs_errorpos(str, len);
3471 if (errmsg != NULL) {
3472 size_t errlen;
3473 wstr = _Py_char2wchar(errmsg, &errlen);
3474 if (wstr != NULL) {
3475 reason = PyUnicode_FromWideChar(wstr, errlen);
3476 PyMem_Free(wstr);
3477 } else
3478 errmsg = NULL;
3479 }
3480 if (errmsg == NULL)
3481 reason = PyUnicode_FromString(
3482 "mbstowcs() encountered an invalid multibyte sequence");
3483 if (reason == NULL)
3484 return NULL;
3485
3486 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3487 "locale", str, len,
3488 (Py_ssize_t)error_pos,
3489 (Py_ssize_t)(error_pos+1),
3490 reason);
3491 Py_DECREF(reason);
3492 if (exc != NULL) {
3493 PyCodec_StrictErrors(exc);
3494 Py_XDECREF(exc);
3495 }
3496 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003497}
3498
3499PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003500PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501{
3502 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003503 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504}
3505
3506
3507PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003508PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003509 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003510 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3511}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003512
Christian Heimes5894ba72007-11-04 11:43:14 +00003513PyObject*
3514PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3515{
Victor Stinner99b95382011-07-04 14:23:54 +02003516#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003517 return PyUnicode_DecodeMBCS(s, size, NULL);
3518#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003519 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003520#else
Victor Stinner793b5312011-04-27 00:24:21 +02003521 PyInterpreterState *interp = PyThreadState_GET()->interp;
3522 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3523 cannot use it to encode and decode filenames before it is loaded. Load
3524 the Python codec requires to encode at least its own filename. Use the C
3525 version of the locale codec until the codec registry is initialized and
3526 the Python codec is loaded.
3527
3528 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3529 cannot only rely on it: check also interp->fscodec_initialized for
3530 subinterpreters. */
3531 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003532 return PyUnicode_Decode(s, size,
3533 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003534 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003535 }
3536 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003537 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003538 }
Victor Stinnerad158722010-10-27 00:25:46 +00003539#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003540}
3541
Martin v. Löwis011e8422009-05-05 04:43:17 +00003542
3543int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003544_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003545{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003546 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003547
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003548 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003549 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003550 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3551 PyUnicode_GET_LENGTH(str), '\0', 1);
3552 if (pos == -1)
3553 return 0;
3554 else
3555 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003556}
3557
Antoine Pitrou13348842012-01-29 18:36:34 +01003558int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003559PyUnicode_FSConverter(PyObject* arg, void* addr)
3560{
3561 PyObject *output = NULL;
3562 Py_ssize_t size;
3563 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003564 if (arg == NULL) {
3565 Py_DECREF(*(PyObject**)addr);
3566 return 1;
3567 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003568 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003569 output = arg;
3570 Py_INCREF(output);
3571 }
3572 else {
3573 arg = PyUnicode_FromObject(arg);
3574 if (!arg)
3575 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003576 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003577 Py_DECREF(arg);
3578 if (!output)
3579 return 0;
3580 if (!PyBytes_Check(output)) {
3581 Py_DECREF(output);
3582 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3583 return 0;
3584 }
3585 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003586 size = PyBytes_GET_SIZE(output);
3587 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003588 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003589 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003590 Py_DECREF(output);
3591 return 0;
3592 }
3593 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003594 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003595}
3596
3597
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003598int
3599PyUnicode_FSDecoder(PyObject* arg, void* addr)
3600{
3601 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003602 if (arg == NULL) {
3603 Py_DECREF(*(PyObject**)addr);
3604 return 1;
3605 }
3606 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003607 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003608 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003609 output = arg;
3610 Py_INCREF(output);
3611 }
3612 else {
3613 arg = PyBytes_FromObject(arg);
3614 if (!arg)
3615 return 0;
3616 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3617 PyBytes_GET_SIZE(arg));
3618 Py_DECREF(arg);
3619 if (!output)
3620 return 0;
3621 if (!PyUnicode_Check(output)) {
3622 Py_DECREF(output);
3623 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3624 return 0;
3625 }
3626 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003627 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003628 Py_DECREF(output);
3629 return 0;
3630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003631 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003632 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003633 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3634 Py_DECREF(output);
3635 return 0;
3636 }
3637 *(PyObject**)addr = output;
3638 return Py_CLEANUP_SUPPORTED;
3639}
3640
3641
Martin v. Löwis5b222132007-06-10 09:51:05 +00003642char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003643PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003644{
Christian Heimesf3863112007-11-22 07:46:41 +00003645 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003646
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003647 if (!PyUnicode_Check(unicode)) {
3648 PyErr_BadArgument();
3649 return NULL;
3650 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003651 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003652 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003653
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003654 if (PyUnicode_UTF8(unicode) == NULL) {
3655 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3657 if (bytes == NULL)
3658 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003659 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3660 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 Py_DECREF(bytes);
3662 return NULL;
3663 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003664 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3665 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3666 PyBytes_AS_STRING(bytes),
3667 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003668 Py_DECREF(bytes);
3669 }
3670
3671 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003672 *psize = PyUnicode_UTF8_LENGTH(unicode);
3673 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003674}
3675
3676char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3680}
3681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682Py_UNICODE *
3683PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685 const unsigned char *one_byte;
3686#if SIZEOF_WCHAR_T == 4
3687 const Py_UCS2 *two_bytes;
3688#else
3689 const Py_UCS4 *four_bytes;
3690 const Py_UCS4 *ucs4_end;
3691 Py_ssize_t num_surrogates;
3692#endif
3693 wchar_t *w;
3694 wchar_t *wchar_end;
3695
3696 if (!PyUnicode_Check(unicode)) {
3697 PyErr_BadArgument();
3698 return NULL;
3699 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003700 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 assert(_PyUnicode_KIND(unicode) != 0);
3703 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003705 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003707 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3708 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 num_surrogates = 0;
3710
3711 for (; four_bytes < ucs4_end; ++four_bytes) {
3712 if (*four_bytes > 0xFFFF)
3713 ++num_surrogates;
3714 }
3715
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3717 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3718 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 PyErr_NoMemory();
3720 return NULL;
3721 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003722 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 w = _PyUnicode_WSTR(unicode);
3725 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3726 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3728 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003729 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003730 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003731 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3732 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 }
3734 else
3735 *w = *four_bytes;
3736
3737 if (w > wchar_end) {
3738 assert(0 && "Miscalculated string end");
3739 }
3740 }
3741 *w = 0;
3742#else
3743 /* sizeof(wchar_t) == 4 */
3744 Py_FatalError("Impossible unicode object state, wstr and str "
3745 "should share memory already.");
3746 return NULL;
3747#endif
3748 }
3749 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3751 (_PyUnicode_LENGTH(unicode) + 1));
3752 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 PyErr_NoMemory();
3754 return NULL;
3755 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003756 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3757 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3758 w = _PyUnicode_WSTR(unicode);
3759 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003761 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3762 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 for (; w < wchar_end; ++one_byte, ++w)
3764 *w = *one_byte;
3765 /* null-terminate the wstr */
3766 *w = 0;
3767 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 for (; w < wchar_end; ++two_bytes, ++w)
3772 *w = *two_bytes;
3773 /* null-terminate the wstr */
3774 *w = 0;
3775#else
3776 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003777 PyObject_FREE(_PyUnicode_WSTR(unicode));
3778 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 Py_FatalError("Impossible unicode object state, wstr "
3780 "and str should share memory already.");
3781 return NULL;
3782#endif
3783 }
3784 else {
3785 assert(0 && "This should never happen.");
3786 }
3787 }
3788 }
3789 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003790 *size = PyUnicode_WSTR_LENGTH(unicode);
3791 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003792}
3793
Alexander Belopolsky40018472011-02-26 01:02:56 +00003794Py_UNICODE *
3795PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798}
3799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800
Alexander Belopolsky40018472011-02-26 01:02:56 +00003801Py_ssize_t
3802PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803{
3804 if (!PyUnicode_Check(unicode)) {
3805 PyErr_BadArgument();
3806 goto onError;
3807 }
3808 return PyUnicode_GET_SIZE(unicode);
3809
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 return -1;
3812}
3813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814Py_ssize_t
3815PyUnicode_GetLength(PyObject *unicode)
3816{
Victor Stinner07621332012-06-16 04:53:46 +02003817 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 PyErr_BadArgument();
3819 return -1;
3820 }
Victor Stinner07621332012-06-16 04:53:46 +02003821 if (PyUnicode_READY(unicode) == -1)
3822 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 return PyUnicode_GET_LENGTH(unicode);
3824}
3825
3826Py_UCS4
3827PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3828{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003829 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3830 PyErr_BadArgument();
3831 return (Py_UCS4)-1;
3832 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003833 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003834 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 return (Py_UCS4)-1;
3836 }
3837 return PyUnicode_READ_CHAR(unicode, index);
3838}
3839
3840int
3841PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3842{
3843 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003844 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 return -1;
3846 }
Victor Stinner488fa492011-12-12 00:01:39 +01003847 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003848 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003849 PyErr_SetString(PyExc_IndexError, "string index out of range");
3850 return -1;
3851 }
Victor Stinner488fa492011-12-12 00:01:39 +01003852 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003853 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003854 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3855 PyErr_SetString(PyExc_ValueError, "character out of range");
3856 return -1;
3857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3859 index, ch);
3860 return 0;
3861}
3862
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3868
Victor Stinner554f3f02010-06-16 23:33:54 +00003869/* create or adjust a UnicodeDecodeError */
3870static void
3871make_decode_exception(PyObject **exceptionObject,
3872 const char *encoding,
3873 const char *input, Py_ssize_t length,
3874 Py_ssize_t startpos, Py_ssize_t endpos,
3875 const char *reason)
3876{
3877 if (*exceptionObject == NULL) {
3878 *exceptionObject = PyUnicodeDecodeError_Create(
3879 encoding, input, length, startpos, endpos, reason);
3880 }
3881 else {
3882 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3883 goto onError;
3884 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3885 goto onError;
3886 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3887 goto onError;
3888 }
3889 return;
3890
3891onError:
3892 Py_DECREF(*exceptionObject);
3893 *exceptionObject = NULL;
3894}
3895
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors/errorHandler: handler name and its cached callable; the callable
       is looked up with PyCodec_LookupError() on first use.
   encoding/reason: passed to the UnicodeDecodeError.
   input/inend/inptr: input buffer bounds and read pointer; re-read from
       the exception's object after the callback and updated in place.
   startinpos/endinpos: error range; *endinpos becomes the resume
       position returned by the handler.
   exceptionObject: cached exception, created or updated here.
   output/outpos: wchar-kind output string (resized when too small) and
       the write offset into it, advanced past the replacement.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out instead of reading a non-bytes object through the
           PyBytes_* macros below; drop the reference obtained above. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4002
4003static int
4004unicode_decode_call_errorhandler_writer(
4005 const char *errors, PyObject **errorHandler,
4006 const char *encoding, const char *reason,
4007 const char **input, const char **inend, Py_ssize_t *startinpos,
4008 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4009 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4010{
4011 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4012
4013 PyObject *restuple = NULL;
4014 PyObject *repunicode = NULL;
4015 Py_ssize_t insize;
4016 Py_ssize_t newpos;
4017 PyObject *inputobj = NULL;
4018
4019 if (*errorHandler == NULL) {
4020 *errorHandler = PyCodec_LookupError(errors);
4021 if (*errorHandler == NULL)
4022 goto onError;
4023 }
4024
4025 make_decode_exception(exceptionObject,
4026 encoding,
4027 *input, *inend - *input,
4028 *startinpos, *endinpos,
4029 reason);
4030 if (*exceptionObject == NULL)
4031 goto onError;
4032
4033 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4034 if (restuple == NULL)
4035 goto onError;
4036 if (!PyTuple_Check(restuple)) {
4037 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4038 goto onError;
4039 }
4040 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004041 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004042
4043 /* Copy back the bytes variables, which might have been modified by the
4044 callback */
4045 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4046 if (!inputobj)
4047 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004048 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004050 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004051 *input = PyBytes_AS_STRING(inputobj);
4052 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004053 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004054 /* we can DECREF safely, as the exception has another reference,
4055 so the object won't go away. */
4056 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004060 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4062 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004065 writer->overallocate = 1;
4066 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4067 return
4068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004070 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004073 Py_XDECREF(restuple);
4074 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004078 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079}
4080
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004081/* --- UTF-7 Codec -------------------------------------------------------- */
4082
Antoine Pitrou244651a2009-05-04 18:56:13 +00004083/* See RFC2152 for details. We encode conservatively and decode liberally. */
4084
4085/* Three simple macros defining base-64. */
4086
/* Is c a base-64 character?
   (Like the other macros below, this evaluates c more than once; do not
   pass an expression with side effects.) */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * arguments containing lower-precedence operators (?:, ||) keep their
 * meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004161
Alexander Belopolsky40018472011-02-26 01:02:56 +00004162PyObject *
4163PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004164 Py_ssize_t size,
4165 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004166{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004167 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4168}
4169
Antoine Pitrou244651a2009-05-04 18:56:13 +00004170/* The decoder. The only state we preserve is our read position,
4171 * i.e. how many characters we have consumed. So if we end in the
4172 * middle of a shift sequence we have to back off the read position
4173 * and the output to the beginning of the sequence, otherwise we lose
4174 * all the shift state (seen bits, number of bits seen, high
4175 * surrogate). */
4176
Alexander Belopolsky40018472011-02-26 01:02:56 +00004177PyObject *
4178PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004179 Py_ssize_t size,
4180 const char *errors,
4181 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004184 Py_ssize_t startinpos;
4185 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004186 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004187 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004188 const char *errmsg = "";
4189 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004190 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004191 unsigned int base64bits = 0;
4192 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004193 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 PyObject *errorHandler = NULL;
4195 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004196
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004197 if (size == 0) {
4198 if (consumed)
4199 *consumed = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004200 Py_INCREF(unicode_empty);
4201 return unicode_empty;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004202 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004203
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204 /* Start off assuming it's all ASCII. Widen later as necessary. */
4205 _PyUnicodeWriter_Init(&writer, 0);
4206 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4207 goto onError;
4208
4209 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004210 e = s + size;
4211
4212 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004213 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004214 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004215 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004216
Antoine Pitrou244651a2009-05-04 18:56:13 +00004217 if (inShift) { /* in a base-64 section */
4218 if (IS_BASE64(ch)) { /* consume a base-64 character */
4219 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4220 base64bits += 6;
4221 s++;
4222 if (base64bits >= 16) {
4223 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004224 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225 base64bits -= 16;
4226 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4227 if (surrogate) {
4228 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004229 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4230 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004231 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004232 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4234 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004235 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004236 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004237 }
4238 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004240 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4242 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004243 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004244 }
4245 }
Victor Stinner551ac952011-11-29 22:58:13 +01004246 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004247 /* first surrogate */
4248 surrogate = outCh;
4249 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004250 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004252 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4254 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004255 }
4256 }
4257 }
4258 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004259 inShift = 0;
4260 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004261 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004262 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004263 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004264 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4265 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004266 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004267 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004268 if (base64bits > 0) { /* left-over bits */
4269 if (base64bits >= 6) {
4270 /* We've seen at least one base-64 character */
4271 errmsg = "partial character in shift sequence";
4272 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004273 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004274 else {
4275 /* Some bits remain; they should be zero */
4276 if (base64buffer != 0) {
4277 errmsg = "non-zero padding bits in shift sequence";
4278 goto utf7Error;
4279 }
4280 }
4281 }
4282 if (ch != '-') {
4283 /* '-' is absorbed; other terminating
4284 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004286 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4288 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004290 }
4291 }
4292 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 s++; /* consume '+' */
4295 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004296 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004298 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4300 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 }
4302 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004305 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 }
4307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004309 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4311 goto onError;
4312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4313 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 else {
4316 startinpos = s-starts;
4317 s++;
4318 errmsg = "unexpected special character";
4319 goto utf7Error;
4320 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004325 errors, &errorHandler,
4326 "utf7", errmsg,
4327 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 }
4331
Antoine Pitrou244651a2009-05-04 18:56:13 +00004332 /* end of string */
4333
4334 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4335 /* if we're in an inconsistent state, that's an error */
4336 if (surrogate ||
4337 (base64bits >= 6) ||
4338 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 errors, &errorHandler,
4342 "utf7", "unterminated shift sequence",
4343 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 goto onError;
4346 if (s < e)
4347 goto restart;
4348 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350
4351 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004352 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004355 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 }
4357 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004358 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 Py_XDECREF(errorHandler);
4363 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 Py_XDECREF(errorHandler);
4368 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004369 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 return NULL;
4371}
4372
4373
Alexander Belopolsky40018472011-02-26 01:02:56 +00004374PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004375_PyUnicode_EncodeUTF7(PyObject *str,
4376 int base64SetO,
4377 int base64WhiteSpace,
4378 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004380 int kind;
4381 void *data;
4382 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004383 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004385 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 unsigned int base64bits = 0;
4387 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 char * out;
4389 char * start;
4390
Benjamin Petersonbac79492012-01-14 13:34:47 -05004391 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004392 return NULL;
4393 kind = PyUnicode_KIND(str);
4394 data = PyUnicode_DATA(str);
4395 len = PyUnicode_GET_LENGTH(str);
4396
4397 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004400 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004401 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004402 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004403 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 if (v == NULL)
4405 return NULL;
4406
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004407 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004408 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004409 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 if (inShift) {
4412 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4413 /* shifting out */
4414 if (base64bits) { /* output remaining bits */
4415 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4416 base64buffer = 0;
4417 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 }
4419 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 /* Characters not in the BASE64 set implicitly unshift the sequence
4421 so no '-' is required, except if the character is itself a '-' */
4422 if (IS_BASE64(ch) || ch == '-') {
4423 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 *out++ = (char) ch;
4426 }
4427 else {
4428 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004429 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 else { /* not in a shift sequence */
4432 if (ch == '+') {
4433 *out++ = '+';
4434 *out++ = '-';
4435 }
4436 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4437 *out++ = (char) ch;
4438 }
4439 else {
4440 *out++ = '+';
4441 inShift = 1;
4442 goto encode_char;
4443 }
4444 }
4445 continue;
4446encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004448 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004449
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 /* code first surrogate */
4451 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004452 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 while (base64bits >= 6) {
4454 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4455 base64bits -= 6;
4456 }
4457 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004458 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 base64bits += 16;
4461 base64buffer = (base64buffer << 16) | ch;
4462 while (base64bits >= 6) {
4463 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4464 base64bits -= 6;
4465 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004466 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 if (base64bits)
4468 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4469 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004471 if (_PyBytes_Resize(&v, out - start) < 0)
4472 return NULL;
4473 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004475PyObject *
4476PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4477 Py_ssize_t size,
4478 int base64SetO,
4479 int base64WhiteSpace,
4480 const char *errors)
4481{
4482 PyObject *result;
4483 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4484 if (tmp == NULL)
4485 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004486 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004487 base64WhiteSpace, errors);
4488 Py_DECREF(tmp);
4489 return result;
4490}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491
Antoine Pitrou244651a2009-05-04 18:56:13 +00004492#undef IS_BASE64
4493#undef FROM_BASE64
4494#undef TO_BASE64
4495#undef DECODE_DIRECT
4496#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498/* --- UTF-8 Codec -------------------------------------------------------- */
4499
Alexander Belopolsky40018472011-02-26 01:02:56 +00004500PyObject *
4501PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004502 Py_ssize_t size,
4503 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504{
Walter Dörwald69652032004-09-07 20:24:22 +00004505 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4506}
4507
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004508#include "stringlib/asciilib.h"
4509#include "stringlib/codecs.h"
4510#include "stringlib/undef.h"
4511
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004512#include "stringlib/ucs1lib.h"
4513#include "stringlib/codecs.h"
4514#include "stringlib/undef.h"
4515
4516#include "stringlib/ucs2lib.h"
4517#include "stringlib/codecs.h"
4518#include "stringlib/undef.h"
4519
4520#include "stringlib/ucs4lib.h"
4521#include "stringlib/codecs.h"
4522#include "stringlib/undef.h"
4523
Antoine Pitrouab868312009-01-10 15:40:25 +00004524/* Mask to quickly check whether a C 'long' contains a
4525 non-ASCII, UTF8-encoded char. */
4526#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004527# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004528#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004529# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004530#else
4531# error C 'long' size should be either 4 or 8!
4532#endif
4533
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004534static Py_ssize_t
4535ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004537 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004538 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004539
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004540#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004541 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4542 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004543 /* Fast path, see in STRINGLIB(utf8_decode) for
4544 an explanation. */
4545 /* Help register allocation */
4546 register const char *_p = p;
4547 register Py_UCS1 * q = dest;
4548 while (_p < aligned_end) {
4549 unsigned long value = *(const unsigned long *) _p;
4550 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004552 *((unsigned long *)q) = value;
4553 _p += SIZEOF_LONG;
4554 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004555 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004556 p = _p;
4557 while (p < end) {
4558 if ((unsigned char)*p & 0x80)
4559 break;
4560 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004562 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004564#endif
4565 while (p < end) {
4566 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4567 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004568 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004569 /* Help register allocation */
4570 register const char *_p = p;
4571 while (_p < aligned_end) {
4572 unsigned long value = *(unsigned long *) _p;
4573 if (value & ASCII_CHAR_MASK)
4574 break;
4575 _p += SIZEOF_LONG;
4576 }
4577 p = _p;
4578 if (_p == end)
4579 break;
4580 }
4581 if ((unsigned char)*p & 0x80)
4582 break;
4583 ++p;
4584 }
4585 memcpy(dest, start, p - start);
4586 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587}
Antoine Pitrouab868312009-01-10 15:40:25 +00004588
Victor Stinner785938e2011-12-11 20:09:03 +01004589PyObject *
4590PyUnicode_DecodeUTF8Stateful(const char *s,
4591 Py_ssize_t size,
4592 const char *errors,
4593 Py_ssize_t *consumed)
4594{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004595 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004596 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004597 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004598
4599 Py_ssize_t startinpos;
4600 Py_ssize_t endinpos;
4601 const char *errmsg = "";
4602 PyObject *errorHandler = NULL;
4603 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004604
4605 if (size == 0) {
4606 if (consumed)
4607 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004608 Py_INCREF(unicode_empty);
4609 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004610 }
4611
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004612 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4613 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004614 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004615 *consumed = 1;
4616 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004617 }
4618
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004619 _PyUnicodeWriter_Init(&writer, 0);
4620 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4621 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004622
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004623 writer.pos = ascii_decode(s, end, writer.data);
4624 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004625 while (s < end) {
4626 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004628 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004629 if (PyUnicode_IS_ASCII(writer.buffer))
4630 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004631 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004632 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004633 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004634 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004635 } else {
4636 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004637 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004638 }
4639
4640 switch (ch) {
4641 case 0:
4642 if (s == end || consumed)
4643 goto End;
4644 errmsg = "unexpected end of data";
4645 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004646 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004647 break;
4648 case 1:
4649 errmsg = "invalid start byte";
4650 startinpos = s - starts;
4651 endinpos = startinpos + 1;
4652 break;
4653 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004654 case 3:
4655 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656 errmsg = "invalid continuation byte";
4657 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004658 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659 break;
4660 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004661 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4664 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 continue;
4666 }
4667
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004668 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 errors, &errorHandler,
4670 "utf-8", errmsg,
4671 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004674 }
4675
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 if (consumed)
4678 *consumed = s - starts;
4679
4680 Py_XDECREF(errorHandler);
4681 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683
4684onError:
4685 Py_XDECREF(errorHandler);
4686 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004689}
4690
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    -- input bytes.
   size -- number of input bytes; must be non-negative.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_Free() to free the memory), or NULL if `size` is out of
   range or the memory allocation fails.

   Every byte that cannot be decoded is mapped to the lone surrogate
   0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Reject sizes for which (size + 1) * sizeof(wchar_t) would not fit.
       The test is phrased without computing size + 1, which overflows for
       PY_SSIZE_T_MAX.  Negative sizes are rejected too: size == -1 would
       otherwise request a zero-byte buffer and then write the terminator
       into it. */
    if (size < 0 || (size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The only character the caller must store itself is one that
               does not fit in a single 16-bit unit.  It is split into a
               surrogate pair below, so it must be a non-BMP code point
               (not a surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status code from the decoder: 0 at end of input means done. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747/* Primary internal function which creates utf8 encoded bytes objects.
4748
4749 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004750 and allocate exactly as much space needed at the end. Else allocate the
4751 maximum possible needed (4 result bytes per Unicode character), and return
4752 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004753*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004754PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004755_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756{
Victor Stinner6099a032011-12-18 14:22:26 +01004757 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 void *data;
4759 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761 if (!PyUnicode_Check(unicode)) {
4762 PyErr_BadArgument();
4763 return NULL;
4764 }
4765
4766 if (PyUnicode_READY(unicode) == -1)
4767 return NULL;
4768
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004769 if (PyUnicode_UTF8(unicode))
4770 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4771 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772
4773 kind = PyUnicode_KIND(unicode);
4774 data = PyUnicode_DATA(unicode);
4775 size = PyUnicode_GET_LENGTH(unicode);
4776
Benjamin Petersonead6b532011-12-20 17:23:42 -06004777 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004778 default:
4779 assert(0);
4780 case PyUnicode_1BYTE_KIND:
4781 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4782 assert(!PyUnicode_IS_ASCII(unicode));
4783 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4784 case PyUnicode_2BYTE_KIND:
4785 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4786 case PyUnicode_4BYTE_KIND:
4787 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789}
4790
Alexander Belopolsky40018472011-02-26 01:02:56 +00004791PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4793 Py_ssize_t size,
4794 const char *errors)
4795{
4796 PyObject *v, *unicode;
4797
4798 unicode = PyUnicode_FromUnicode(s, size);
4799 if (unicode == NULL)
4800 return NULL;
4801 v = _PyUnicode_AsUTF8String(unicode, errors);
4802 Py_DECREF(unicode);
4803 return v;
4804}
4805
4806PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004807PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
4811
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812/* --- UTF-32 Codec ------------------------------------------------------- */
4813
4814PyObject *
4815PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 Py_ssize_t size,
4817 const char *errors,
4818 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004819{
4820 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4821}
4822
4823PyObject *
4824PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 Py_ssize_t size,
4826 const char *errors,
4827 int *byteorder,
4828 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004829{
4830 const char *starts = s;
4831 Py_ssize_t startinpos;
4832 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004833 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004834 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004835 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004837 PyObject *errorHandler = NULL;
4838 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004839
Walter Dörwald41980ca2007-08-16 21:55:45 +00004840 q = (unsigned char *)s;
4841 e = q + size;
4842
4843 if (byteorder)
4844 bo = *byteorder;
4845
4846 /* Check for BOM marks (U+FEFF) in the input and adjust current
4847 byte order setting accordingly. In native mode, the leading BOM
4848 mark is skipped, in all other modes, it is copied to the output
4849 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004850 if (bo == 0 && size >= 4) {
4851 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4852 if (bom == 0x0000FEFF) {
4853 bo = -1;
4854 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004856 else if (bom == 0xFFFE0000) {
4857 bo = 1;
4858 q += 4;
4859 }
4860 if (byteorder)
4861 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862 }
4863
Victor Stinnere64322e2012-10-30 23:12:47 +01004864 if (q == e) {
4865 if (consumed)
4866 *consumed = size;
4867 Py_INCREF(unicode_empty);
4868 return unicode_empty;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004869 }
4870
Victor Stinnere64322e2012-10-30 23:12:47 +01004871#ifdef WORDS_BIGENDIAN
4872 le = bo < 0;
4873#else
4874 le = bo <= 0;
4875#endif
4876
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004877 _PyUnicodeWriter_Init(&writer, 0);
4878 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4879 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004880
Victor Stinnere64322e2012-10-30 23:12:47 +01004881 while (1) {
4882 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004884
Victor Stinnere64322e2012-10-30 23:12:47 +01004885 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 enum PyUnicode_Kind kind = writer.kind;
4887 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004888 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004889 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004890 if (le) {
4891 do {
4892 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4893 if (ch > maxch)
4894 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004895 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004896 q += 4;
4897 } while (q <= last);
4898 }
4899 else {
4900 do {
4901 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4902 if (ch > maxch)
4903 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004904 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004905 q += 4;
4906 } while (q <= last);
4907 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004909 }
4910
4911 if (ch <= maxch) {
4912 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004914 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 startinpos = ((const char *)q) - starts;
4917 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004919 else {
4920 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004921 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004922 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4924 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004925 q += 4;
4926 continue;
4927 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004929 startinpos = ((const char *)q) - starts;
4930 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004932
4933 /* The remaining input chars are ignored if the callback
4934 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004935 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 errors, &errorHandler,
4937 "utf32", errmsg,
4938 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004939 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 }
4942
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945
Walter Dörwald41980ca2007-08-16 21:55:45 +00004946 Py_XDECREF(errorHandler);
4947 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949
Benjamin Peterson29060642009-01-31 22:14:21 +00004950 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004951 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 Py_XDECREF(errorHandler);
4953 Py_XDECREF(exc);
4954 return NULL;
4955}
4956
4957PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004958_PyUnicode_EncodeUTF32(PyObject *str,
4959 const char *errors,
4960 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004962 int kind;
4963 void *data;
4964 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004965 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004967 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004969#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 int iorder[] = {0, 1, 2, 3};
4971#else
4972 int iorder[] = {3, 2, 1, 0};
4973#endif
4974
Benjamin Peterson29060642009-01-31 22:14:21 +00004975#define STORECHAR(CH) \
4976 do { \
4977 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4978 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4979 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4980 p[iorder[0]] = (CH) & 0xff; \
4981 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 } while(0)
4983
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004984 if (!PyUnicode_Check(str)) {
4985 PyErr_BadArgument();
4986 return NULL;
4987 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004988 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004989 return NULL;
4990 kind = PyUnicode_KIND(str);
4991 data = PyUnicode_DATA(str);
4992 len = PyUnicode_GET_LENGTH(str);
4993
4994 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004995 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004997 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004998 if (v == NULL)
4999 return NULL;
5000
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005001 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005004 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005005 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006
5007 if (byteorder == -1) {
5008 /* force LE */
5009 iorder[0] = 0;
5010 iorder[1] = 1;
5011 iorder[2] = 2;
5012 iorder[3] = 3;
5013 }
5014 else if (byteorder == 1) {
5015 /* force BE */
5016 iorder[0] = 3;
5017 iorder[1] = 2;
5018 iorder[2] = 1;
5019 iorder[3] = 0;
5020 }
5021
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005022 for (i = 0; i < len; i++)
5023 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005024
5025 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005026 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027#undef STORECHAR
5028}
5029
Alexander Belopolsky40018472011-02-26 01:02:56 +00005030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005031PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5032 Py_ssize_t size,
5033 const char *errors,
5034 int byteorder)
5035{
5036 PyObject *result;
5037 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5038 if (tmp == NULL)
5039 return NULL;
5040 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5041 Py_DECREF(tmp);
5042 return result;
5043}
5044
5045PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005046PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047{
Victor Stinnerb960b342011-11-20 19:12:52 +01005048 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049}
5050
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051/* --- UTF-16 Codec ------------------------------------------------------- */
5052
Tim Peters772747b2001-08-09 22:21:55 +00005053PyObject *
5054PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 Py_ssize_t size,
5056 const char *errors,
5057 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058{
Walter Dörwald69652032004-09-07 20:24:22 +00005059 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5060}
5061
5062PyObject *
5063PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 Py_ssize_t size,
5065 const char *errors,
5066 int *byteorder,
5067 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005070 Py_ssize_t startinpos;
5071 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005072 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005073 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005074 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005075 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005076 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 PyObject *errorHandler = NULL;
5078 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
Tim Peters772747b2001-08-09 22:21:55 +00005080 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005081 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
5083 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005084 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005086 /* Check for BOM marks (U+FEFF) in the input and adjust current
5087 byte order setting accordingly. In native mode, the leading BOM
5088 mark is skipped, in all other modes, it is copied to the output
5089 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005090 if (bo == 0 && size >= 2) {
5091 const Py_UCS4 bom = (q[1] << 8) | q[0];
5092 if (bom == 0xFEFF) {
5093 q += 2;
5094 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005096 else if (bom == 0xFFFE) {
5097 q += 2;
5098 bo = 1;
5099 }
5100 if (byteorder)
5101 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103
Antoine Pitrou63065d72012-05-15 23:48:04 +02005104 if (q == e) {
5105 if (consumed)
5106 *consumed = size;
5107 Py_INCREF(unicode_empty);
5108 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005109 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005110
Christian Heimes743e0cd2012-10-17 23:52:17 +02005111#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005112 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005113#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005114 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005115#endif
Tim Peters772747b2001-08-09 22:21:55 +00005116
Antoine Pitrou63065d72012-05-15 23:48:04 +02005117 /* Note: size will always be longer than the resulting Unicode
5118 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005119 _PyUnicodeWriter_Init(&writer, 0);
5120 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5121 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005122
Antoine Pitrou63065d72012-05-15 23:48:04 +02005123 while (1) {
5124 Py_UCS4 ch = 0;
5125 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005126 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005127 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005128 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005129 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005130 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005131 native_ordering);
5132 else
5133 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005134 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 native_ordering);
5136 } else if (kind == PyUnicode_2BYTE_KIND) {
5137 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005138 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005139 native_ordering);
5140 } else {
5141 assert(kind == PyUnicode_4BYTE_KIND);
5142 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005143 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005144 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005145 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147
Antoine Pitrou63065d72012-05-15 23:48:04 +02005148 switch (ch)
5149 {
5150 case 0:
5151 /* remaining byte at the end? (size should be even) */
5152 if (q == e || consumed)
5153 goto End;
5154 errmsg = "truncated data";
5155 startinpos = ((const char *)q) - starts;
5156 endinpos = ((const char *)e) - starts;
5157 break;
5158 /* The remaining input chars are ignored if the callback
5159 chooses to skip the input */
5160 case 1:
5161 errmsg = "unexpected end of data";
5162 startinpos = ((const char *)q) - 2 - starts;
5163 endinpos = ((const char *)e) - starts;
5164 break;
5165 case 2:
5166 errmsg = "illegal encoding";
5167 startinpos = ((const char *)q) - 2 - starts;
5168 endinpos = startinpos + 2;
5169 break;
5170 case 3:
5171 errmsg = "illegal UTF-16 surrogate";
5172 startinpos = ((const char *)q) - 4 - starts;
5173 endinpos = startinpos + 2;
5174 break;
5175 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005176 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005177 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5179 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 continue;
5181 }
5182
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005183 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005184 errors,
5185 &errorHandler,
5186 "utf16", errmsg,
5187 &starts,
5188 (const char **)&e,
5189 &startinpos,
5190 &endinpos,
5191 &exc,
5192 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005193 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 }
5196
Antoine Pitrou63065d72012-05-15 23:48:04 +02005197End:
Walter Dörwald69652032004-09-07 20:24:22 +00005198 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 Py_XDECREF(errorHandler);
5202 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005203 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005206 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207 Py_XDECREF(errorHandler);
5208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 return NULL;
5210}
5211
Tim Peters772747b2001-08-09 22:21:55 +00005212PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005213_PyUnicode_EncodeUTF16(PyObject *str,
5214 const char *errors,
5215 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005217 enum PyUnicode_Kind kind;
5218 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005219 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005220 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005221 unsigned short *out;
5222 Py_ssize_t bytesize;
5223 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005224#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005225 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005226#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005227 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005228#endif
5229
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005230 if (!PyUnicode_Check(str)) {
5231 PyErr_BadArgument();
5232 return NULL;
5233 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005234 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005235 return NULL;
5236 kind = PyUnicode_KIND(str);
5237 data = PyUnicode_DATA(str);
5238 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005239
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005240 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005241 if (kind == PyUnicode_4BYTE_KIND) {
5242 const Py_UCS4 *in = (const Py_UCS4 *)data;
5243 const Py_UCS4 *end = in + len;
5244 while (in < end)
5245 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005246 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005247 }
5248 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005250 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005251 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 if (v == NULL)
5253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005255 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005256 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005257 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005260 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005261 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005262
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005263 switch (kind) {
5264 case PyUnicode_1BYTE_KIND: {
5265 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5266 break;
Tim Peters772747b2001-08-09 22:21:55 +00005267 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 case PyUnicode_2BYTE_KIND: {
5269 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5270 break;
Tim Peters772747b2001-08-09 22:21:55 +00005271 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005272 case PyUnicode_4BYTE_KIND: {
5273 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5274 break;
5275 }
5276 default:
5277 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005278 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005279
5280 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005281 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282}
5283
Alexander Belopolsky40018472011-02-26 01:02:56 +00005284PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005285PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5286 Py_ssize_t size,
5287 const char *errors,
5288 int byteorder)
5289{
5290 PyObject *result;
5291 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5292 if (tmp == NULL)
5293 return NULL;
5294 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5295 Py_DECREF(tmp);
5296 return result;
5297}
5298
5299PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005300PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005302 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303}
5304
5305/* --- Unicode Escape Codec ----------------------------------------------- */
5306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005307/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5308 if all the escapes in the string make it still a valid ASCII string.
5309 Returns -1 if any escapes were found which cause the string to
5310 pop out of ASCII range. Otherwise returns the length of the
5311 required buffer to hold the string.
5312 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005313static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005314length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5315{
5316 const unsigned char *p = (const unsigned char *)s;
5317 const unsigned char *end = p + size;
5318 Py_ssize_t length = 0;
5319
5320 if (size < 0)
5321 return -1;
5322
5323 for (; p < end; ++p) {
5324 if (*p > 127) {
5325 /* Non-ASCII */
5326 return -1;
5327 }
5328 else if (*p != '\\') {
5329 /* Normal character */
5330 ++length;
5331 }
5332 else {
5333 /* Backslash-escape, check next char */
5334 ++p;
5335 /* Escape sequence reaches till end of string or
5336 non-ASCII follow-up. */
5337 if (p >= end || *p > 127)
5338 return -1;
5339 switch (*p) {
5340 case '\n':
5341 /* backslash + \n result in zero characters */
5342 break;
5343 case '\\': case '\'': case '\"':
5344 case 'b': case 'f': case 't':
5345 case 'n': case 'r': case 'v': case 'a':
5346 ++length;
5347 break;
5348 case '0': case '1': case '2': case '3':
5349 case '4': case '5': case '6': case '7':
5350 case 'x': case 'u': case 'U': case 'N':
5351 /* these do not guarantee ASCII characters */
5352 return -1;
5353 default:
5354 /* count the backslash + the other character */
5355 length += 2;
5356 }
5357 }
5358 }
5359 return length;
5360}
5361
Fredrik Lundh06d12682001-01-24 07:59:11 +00005362static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005363
Alexander Belopolsky40018472011-02-26 01:02:56 +00005364PyObject *
5365PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005366 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005367 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005370 Py_ssize_t startinpos;
5371 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 int j;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005375 char* message;
5376 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 PyObject *errorHandler = NULL;
5378 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005379 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005380
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005381 len = length_of_escaped_ascii_string(s, size);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 if (len == 0) {
5383 Py_INCREF(unicode_empty);
5384 return unicode_empty;
5385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386
5387 /* After length_of_escaped_ascii_string() there are two alternatives,
5388 either the string is pure ASCII with named escapes like \n, etc.
5389 and we determined it's exact size (common case)
5390 or it contains \x, \u, ... escape sequences. then we create a
5391 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005392 _PyUnicodeWriter_Init(&writer, 0);
5393 if (len > 0) {
5394 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005395 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005396 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 }
5398 else {
5399 /* Escaped strings will always be longer than the resulting
5400 Unicode string, so we start with size here and then reduce the
5401 length after conversion to the true value.
5402 (but if the error callback returns a long replacement string
5403 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005404 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406 }
5407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005409 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005411
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 while (s < end) {
5413 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005414 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416
5417 /* Non-escape characters are interpreted as Unicode ordinals */
5418 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005419 x = (unsigned char)*s;
5420 s++;
5421 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005422 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005423 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5424 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 continue;
5426 }
5427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 /* \ - Escapes */
5430 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005431 c = *s++;
5432 if (s > end)
5433 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005434
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005435 /* The only case in which i == ascii_length is a backslash
5436 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005439 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005442#define WRITECHAR(ch) \
5443 do { \
5444 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5445 goto onError; \
5446 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5447 writer.pos++; \
5448 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451 case '\\': WRITECHAR('\\'); break;
5452 case '\'': WRITECHAR('\''); break;
5453 case '\"': WRITECHAR('\"'); break;
5454 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005455 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005456 case 'f': WRITECHAR('\014'); break;
5457 case 't': WRITECHAR('\t'); break;
5458 case 'n': WRITECHAR('\n'); break;
5459 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005461 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005463 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 case '0': case '1': case '2': case '3':
5467 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005468 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005469 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005470 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005471 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005472 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005474 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 break;
5476
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 /* hex escapes */
5478 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005480 digits = 2;
5481 message = "truncated \\xXX escape";
5482 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005486 digits = 4;
5487 message = "truncated \\uXXXX escape";
5488 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005491 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 digits = 8;
5493 message = "truncated \\UXXXXXXXX escape";
5494 hexescape:
5495 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 if (s+digits>end) {
5497 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005498 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 errors, &errorHandler,
5500 "unicodeescape", "end of string in escape sequence",
5501 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005502 &writer))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 goto onError;
5504 goto nextByte;
5505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005506 for (j = 0; j < digits; ++j) {
5507 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005508 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 endinpos = (s+j+1)-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005510 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 errors, &errorHandler,
5512 "unicodeescape", message,
5513 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005517 }
5518 chr = (chr<<4) & ~0xF;
5519 if (c >= '0' && c <= '9')
5520 chr += c - '0';
5521 else if (c >= 'a' && c <= 'f')
5522 chr += 10 + c - 'a';
5523 else
5524 chr += 10 + c - 'A';
5525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005527 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 /* _decoding_error will have already written into the
5529 target buffer. */
5530 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005531 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005532 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005533 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005534 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005537 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 errors, &errorHandler,
5539 "unicodeescape", "illegal Unicode character",
5540 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005541 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005542 goto onError;
5543 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005544 break;
5545
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005547 case 'N':
5548 message = "malformed \\N character escape";
5549 if (ucnhash_CAPI == NULL) {
5550 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5552 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553 if (ucnhash_CAPI == NULL)
5554 goto ucnhashError;
5555 }
5556 if (*s == '{') {
5557 const char *start = s+1;
5558 /* look for the closing brace */
5559 while (*s != '}' && s < end)
5560 s++;
5561 if (s > start && s < end && *s == '}') {
5562 /* found a name. look it up in the unicode database */
5563 message = "unknown Unicode character name";
5564 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005566 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005567 goto store;
5568 }
5569 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 errors, &errorHandler,
5573 "unicodeescape", message,
5574 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 &writer))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005576 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005577 break;
5578
5579 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005580 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581 message = "\\ at end of string";
5582 s--;
5583 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 errors, &errorHandler,
5586 "unicodeescape", message,
5587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005588 &writer))
Walter Dörwald8c077222002-03-25 11:16:18 +00005589 goto onError;
5590 }
5591 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005592 WRITECHAR('\\');
5593 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005594 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005600#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005601
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005602 Py_XDECREF(errorHandler);
5603 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005605
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005607 PyErr_SetString(
5608 PyExc_UnicodeError,
5609 "\\N escapes not supported (can't load unicodedata module)"
5610 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 Py_XDECREF(errorHandler);
5613 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005614 return NULL;
5615
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005617 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 Py_XDECREF(errorHandler);
5619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 return NULL;
5621}
5622
5623/* Return a Unicode-Escape string version of the Unicode object.
5624
5625 If quotes is true, the string is enclosed in u"" or u'' quotes as
5626 appropriate.
5627
5628*/
5629
Alexander Belopolsky40018472011-02-26 01:02:56 +00005630PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005631PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005634 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005636 int kind;
5637 void *data;
5638 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Ezio Melottie7f90372012-10-05 03:33:31 +03005640 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005641 escape.
5642
Ezio Melottie7f90372012-10-05 03:33:31 +03005643 For UCS1 strings it's '\xxx', 4 bytes per source character.
5644 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5645 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005646 */
5647
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 if (!PyUnicode_Check(unicode)) {
5649 PyErr_BadArgument();
5650 return NULL;
5651 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005652 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005653 return NULL;
5654 len = PyUnicode_GET_LENGTH(unicode);
5655 kind = PyUnicode_KIND(unicode);
5656 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005657 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005658 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5659 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5660 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5661 }
5662
5663 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005664 return PyBytes_FromStringAndSize(NULL, 0);
5665
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005666 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005668
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005669 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005671 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 if (repr == NULL)
5674 return NULL;
5675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005679 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005680
Walter Dörwald79e913e2007-05-12 11:08:06 +00005681 /* Escape backslashes */
5682 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 *p++ = '\\';
5684 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005685 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005686 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005687
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005688 /* Map 21-bit characters to '\U00xxxxxx' */
5689 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005690 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005691 *p++ = '\\';
5692 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005693 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5694 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5695 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5696 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5697 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5698 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5699 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5700 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005702 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005705 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 *p++ = '\\';
5707 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005708 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5709 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5710 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5711 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005713
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005714 /* Map special whitespace to '\t', \n', '\r' */
5715 else if (ch == '\t') {
5716 *p++ = '\\';
5717 *p++ = 't';
5718 }
5719 else if (ch == '\n') {
5720 *p++ = '\\';
5721 *p++ = 'n';
5722 }
5723 else if (ch == '\r') {
5724 *p++ = '\\';
5725 *p++ = 'r';
5726 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005727
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005728 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005729 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005731 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005732 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5733 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005734 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 /* Copy everything else as-is */
5737 else
5738 *p++ = (char) ch;
5739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005741 assert(p - PyBytes_AS_STRING(repr) > 0);
5742 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5743 return NULL;
5744 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5749 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 PyObject *result;
5752 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5753 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 result = PyUnicode_AsUnicodeEscapeString(tmp);
5756 Py_DECREF(tmp);
5757 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
5760/* --- Raw Unicode Escape Codec ------------------------------------------- */
5761
Alexander Belopolsky40018472011-02-26 01:02:56 +00005762PyObject *
5763PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005764 Py_ssize_t size,
5765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 Py_ssize_t startinpos;
5769 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005770 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 const char *end;
5772 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 PyObject *errorHandler = NULL;
5774 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005775
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005776 if (size == 0) {
5777 Py_INCREF(unicode_empty);
5778 return unicode_empty;
5779 }
5780
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 /* Escaped strings will always be longer than the resulting
5782 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 length after conversion to the true value. (But decoding error
5784 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 _PyUnicodeWriter_Init(&writer, 1);
5786 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 end = s + size;
5790 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 unsigned char c;
5792 Py_UCS4 x;
5793 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005794 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 /* Non-escape characters are interpreted as Unicode ordinals */
5797 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 x = (unsigned char)*s++;
5799 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005800 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005801 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5802 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 startinpos = s-starts;
5806
5807 /* \u-escapes are only interpreted iff the number of leading
5808 backslashes if odd */
5809 bs = s;
5810 for (;s < end;) {
5811 if (*s != '\\')
5812 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005813 x = (unsigned char)*s++;
5814 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005816 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5817 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 }
5819 if (((s - bs) & 1) == 0 ||
5820 s >= end ||
5821 (*s != 'u' && *s != 'U')) {
5822 continue;
5823 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005824 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 count = *s=='u' ? 4 : 8;
5826 s++;
5827
5828 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 for (x = 0, i = 0; i < count; ++i, ++s) {
5830 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005831 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005833 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 errors, &errorHandler,
5835 "rawunicodeescape", "truncated \\uXXXX",
5836 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005837 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 goto onError;
5839 goto nextByte;
5840 }
5841 x = (x<<4) & ~0xF;
5842 if (c >= '0' && c <= '9')
5843 x += c - '0';
5844 else if (c >= 'a' && c <= 'f')
5845 x += 10 + c - 'a';
5846 else
5847 x += 10 + c - 'A';
5848 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005849 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005850 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005852 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5853 writer.pos++;
5854 }
5855 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005856 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005858 errors, &errorHandler,
5859 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005863 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 nextByte:
5865 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005869 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005870
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005872 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 Py_XDECREF(errorHandler);
5874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 return NULL;
5876}
5877
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878
Alexander Belopolsky40018472011-02-26 01:02:56 +00005879PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005882 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 char *p;
5884 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 Py_ssize_t expandsize, pos;
5886 int kind;
5887 void *data;
5888 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (!PyUnicode_Check(unicode)) {
5891 PyErr_BadArgument();
5892 return NULL;
5893 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005894 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 return NULL;
5896 kind = PyUnicode_KIND(unicode);
5897 data = PyUnicode_DATA(unicode);
5898 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005899 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5900 bytes, and 1 byte characters 4. */
5901 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005902
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 if (repr == NULL)
5908 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005909 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005910 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 for (pos = 0; pos < len; pos++) {
5914 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 /* Map 32-bit characters to '\Uxxxxxxxx' */
5916 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005917 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005918 *p++ = '\\';
5919 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005920 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5921 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5922 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5926 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5927 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 *p++ = '\\';
5932 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005933 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5934 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5935 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5936 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 /* Copy everything else as-is */
5939 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 *p++ = (char) ch;
5941 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005942
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 assert(p > q);
5944 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005945 return NULL;
5946 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947}
5948
Alexander Belopolsky40018472011-02-26 01:02:56 +00005949PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5951 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953 PyObject *result;
5954 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5955 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005956 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005957 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5958 Py_DECREF(tmp);
5959 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960}
5961
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005962/* --- Unicode Internal Codec ------------------------------------------- */
5963
Alexander Belopolsky40018472011-02-26 01:02:56 +00005964PyObject *
5965_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005966 Py_ssize_t size,
5967 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005968{
5969 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005970 Py_ssize_t startinpos;
5971 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005972 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005973 const char *end;
5974 const char *reason;
5975 PyObject *errorHandler = NULL;
5976 PyObject *exc = NULL;
5977
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005978 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005979 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005980 1))
5981 return NULL;
5982
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005983 if (size == 0) {
5984 Py_INCREF(unicode_empty);
5985 return unicode_empty;
5986 }
5987
Thomas Wouters89f507f2006-12-13 04:49:30 +00005988 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005989 _PyUnicodeWriter_Init(&writer, 0);
5990 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005992 end = s + size;
5993
5994 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005995 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005996 Py_UCS4 ch;
5997 /* We copy the raw representation one byte at a time because the
5998 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005999 ((char *) &uch)[0] = s[0];
6000 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006001#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006002 ((char *) &uch)[2] = s[2];
6003 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006004#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006005 ch = uch;
6006
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006007 /* We have to sanity check the raw data, otherwise doom looms for
6008 some malformed UCS-4 data. */
6009 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006010#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006011 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006012#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006013 end-s < Py_UNICODE_SIZE
6014 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006016 startinpos = s - starts;
6017 if (end-s < Py_UNICODE_SIZE) {
6018 endinpos = end-starts;
6019 reason = "truncated input";
6020 }
6021 else {
6022 endinpos = s - starts + Py_UNICODE_SIZE;
6023 reason = "illegal code point (> 0x10FFFF)";
6024 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 if (unicode_decode_call_errorhandler_writer(
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006026 errors, &errorHandler,
6027 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006028 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006029 &writer))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006031 continue;
6032 }
6033
6034 s += Py_UNICODE_SIZE;
6035#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006036 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006037 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006038 Py_UNICODE uch2;
6039 ((char *) &uch2)[0] = s[0];
6040 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006041 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006042 {
Victor Stinner551ac952011-11-29 22:58:13 +01006043 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006044 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006045 }
6046 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006047#endif
6048
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006050 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6052 writer.pos++;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006053 }
6054
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006055 Py_XDECREF(errorHandler);
6056 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006058
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006060 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006061 Py_XDECREF(errorHandler);
6062 Py_XDECREF(exc);
6063 return NULL;
6064}
6065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066/* --- Latin-1 Codec ------------------------------------------------------ */
6067
Alexander Belopolsky40018472011-02-26 01:02:56 +00006068PyObject *
6069PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006070 Py_ssize_t size,
6071 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006074 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075}
6076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006078static void
6079make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006080 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006081 PyObject *unicode,
6082 Py_ssize_t startpos, Py_ssize_t endpos,
6083 const char *reason)
6084{
6085 if (*exceptionObject == NULL) {
6086 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006087 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006088 encoding, unicode, startpos, endpos, reason);
6089 }
6090 else {
6091 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6092 goto onError;
6093 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6094 goto onError;
6095 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6096 goto onError;
6097 return;
6098 onError:
6099 Py_DECREF(*exceptionObject);
6100 *exceptionObject = NULL;
6101 }
6102}
6103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006105static void
6106raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006107 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006108 PyObject *unicode,
6109 Py_ssize_t startpos, Py_ssize_t endpos,
6110 const char *reason)
6111{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006112 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006113 encoding, unicode, startpos, endpos, reason);
6114 if (*exceptionObject != NULL)
6115 PyCodec_StrictErrors(*exceptionObject);
6116}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117
6118/* error handling callback helper:
6119 build arguments, call the callback and check the arguments,
6120 put the result into newpos and return the replacement string, which
6121 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006122static PyObject *
6123unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006124 PyObject **errorHandler,
6125 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006126 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006127 Py_ssize_t startpos, Py_ssize_t endpos,
6128 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006130 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006131 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 PyObject *restuple;
6133 PyObject *resunicode;
6134
6135 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 }
6140
Benjamin Petersonbac79492012-01-14 13:34:47 -05006141 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006142 return NULL;
6143 len = PyUnicode_GET_LENGTH(unicode);
6144
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006145 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006146 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149
6150 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006155 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 Py_DECREF(restuple);
6157 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006159 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 &resunicode, newpos)) {
6161 Py_DECREF(restuple);
6162 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006164 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6165 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6166 Py_DECREF(restuple);
6167 return NULL;
6168 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006170 *newpos = len + *newpos;
6171 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6173 Py_DECREF(restuple);
6174 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006175 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 Py_INCREF(resunicode);
6177 Py_DECREF(restuple);
6178 return resunicode;
6179}
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006182unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006183 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006184 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006186 /* input state */
6187 Py_ssize_t pos=0, size;
6188 int kind;
6189 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 /* output object */
6191 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 /* pointer into the output */
6193 char *str;
6194 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006196 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6197 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 PyObject *errorHandler = NULL;
6199 PyObject *exc = NULL;
6200 /* the following variable is used for caching string comparisons
6201 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6202 int known_errorHandler = -1;
6203
Benjamin Petersonbac79492012-01-14 13:34:47 -05006204 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006205 return NULL;
6206 size = PyUnicode_GET_LENGTH(unicode);
6207 kind = PyUnicode_KIND(unicode);
6208 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 /* allocate enough for a simple encoding without
6210 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006211 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006212 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006213 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006215 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006216 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 ressize = size;
6218
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006219 while (pos < size) {
6220 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 /* can we encode this? */
6223 if (c<limit) {
6224 /* no overflow check, because we know that the space is enough */
6225 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006226 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 Py_ssize_t requiredsize;
6230 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006231 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006233 Py_ssize_t collstart = pos;
6234 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006236 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 ++collend;
6238 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6239 if (known_errorHandler==-1) {
6240 if ((errors==NULL) || (!strcmp(errors, "strict")))
6241 known_errorHandler = 1;
6242 else if (!strcmp(errors, "replace"))
6243 known_errorHandler = 2;
6244 else if (!strcmp(errors, "ignore"))
6245 known_errorHandler = 3;
6246 else if (!strcmp(errors, "xmlcharrefreplace"))
6247 known_errorHandler = 4;
6248 else
6249 known_errorHandler = 0;
6250 }
6251 switch (known_errorHandler) {
6252 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006253 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 goto onError;
6255 case 2: /* replace */
6256 while (collstart++<collend)
6257 *str++ = '?'; /* fall through */
6258 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006259 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 break;
6261 case 4: /* xmlcharrefreplace */
6262 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 /* determine replacement size */
6264 for (i = collstart, repsize = 0; i < collend; ++i) {
6265 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6266 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006268 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006270 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006272 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006276 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006278 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006279 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006281 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006283 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 if (requiredsize > ressize) {
6285 if (requiredsize<2*ressize)
6286 requiredsize = 2*ressize;
6287 if (_PyBytes_Resize(&res, requiredsize))
6288 goto onError;
6289 str = PyBytes_AS_STRING(res) + respos;
6290 ressize = requiredsize;
6291 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006292 /* generate replacement */
6293 for (i = collstart; i < collend; ++i) {
6294 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006296 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 break;
6298 default:
6299 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006300 encoding, reason, unicode, &exc,
6301 collstart, collend, &newpos);
6302 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006303 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006305 if (PyBytes_Check(repunicode)) {
6306 /* Directly copy bytes result to output. */
6307 repsize = PyBytes_Size(repunicode);
6308 if (repsize > 1) {
6309 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006310 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006311 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6312 Py_DECREF(repunicode);
6313 goto onError;
6314 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006315 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006316 ressize += repsize-1;
6317 }
6318 memcpy(str, PyBytes_AsString(repunicode), repsize);
6319 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006320 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006321 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006322 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 /* need more space? (at least enough for what we
6325 have+the replacement+the rest of the string, so
6326 we won't have to check space for encodable characters) */
6327 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 repsize = PyUnicode_GET_LENGTH(repunicode);
6329 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 if (requiredsize > ressize) {
6331 if (requiredsize<2*ressize)
6332 requiredsize = 2*ressize;
6333 if (_PyBytes_Resize(&res, requiredsize)) {
6334 Py_DECREF(repunicode);
6335 goto onError;
6336 }
6337 str = PyBytes_AS_STRING(res) + respos;
6338 ressize = requiredsize;
6339 }
6340 /* check if there is anything unencodable in the replacement
6341 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 for (i = 0; repsize-->0; ++i, ++str) {
6343 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006345 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006346 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 Py_DECREF(repunicode);
6348 goto onError;
6349 }
6350 *str = (char)c;
6351 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006353 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006355 }
6356 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006357 /* Resize if we allocated to much */
6358 size = str - PyBytes_AS_STRING(res);
6359 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006360 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006361 if (_PyBytes_Resize(&res, size) < 0)
6362 goto onError;
6363 }
6364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 Py_XDECREF(errorHandler);
6366 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006367 return res;
6368
6369 onError:
6370 Py_XDECREF(res);
6371 Py_XDECREF(errorHandler);
6372 Py_XDECREF(exc);
6373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374}
6375
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006376/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006377PyObject *
6378PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006379 Py_ssize_t size,
6380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006382 PyObject *result;
6383 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6384 if (unicode == NULL)
6385 return NULL;
6386 result = unicode_encode_ucs1(unicode, errors, 256);
6387 Py_DECREF(unicode);
6388 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
Alexander Belopolsky40018472011-02-26 01:02:56 +00006391PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006392_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 PyErr_BadArgument();
6396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006398 if (PyUnicode_READY(unicode) == -1)
6399 return NULL;
6400 /* Fast path: if it is a one-byte string, construct
6401 bytes object directly. */
6402 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6403 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6404 PyUnicode_GET_LENGTH(unicode));
6405 /* Non-Latin-1 characters present. Defer to above function to
6406 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006407 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408}
6409
6410PyObject*
6411PyUnicode_AsLatin1String(PyObject *unicode)
6412{
6413 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414}
6415
6416/* --- 7-bit ASCII Codec -------------------------------------------------- */
6417
Alexander Belopolsky40018472011-02-26 01:02:56 +00006418PyObject *
6419PyUnicode_DecodeASCII(const char *s,
6420 Py_ssize_t size,
6421 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006424 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006425 int kind;
6426 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t startinpos;
6428 Py_ssize_t endinpos;
6429 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 const char *e;
6431 PyObject *errorHandler = NULL;
6432 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006433
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006434 if (size == 0) {
6435 Py_INCREF(unicode_empty);
6436 return unicode_empty;
6437 }
6438
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006440 if (size == 1 && (unsigned char)s[0] < 128)
6441 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006442
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006443 _PyUnicodeWriter_Init(&writer, 0);
6444 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006449 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 writer.pos = outpos;
6451 if (writer.pos == size)
6452 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006453
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006454 s += writer.pos;
6455 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 register unsigned char c = (unsigned char)*s;
6458 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006459 PyUnicode_WRITE(kind, data, writer.pos, c);
6460 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 ++s;
6462 }
6463 else {
6464 startinpos = s-starts;
6465 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 errors, &errorHandler,
6468 "ascii", "ordinal not in range(128)",
6469 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006470 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006472 kind = writer.kind;
6473 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 Py_XDECREF(errorHandler);
6477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006478 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006479
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006481 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 Py_XDECREF(errorHandler);
6483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 return NULL;
6485}
6486
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
6489PyUnicode_EncodeASCII(const Py_UNICODE *p,
6490 Py_ssize_t size,
6491 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 PyObject *result;
6494 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6495 if (unicode == NULL)
6496 return NULL;
6497 result = unicode_encode_ucs1(unicode, errors, 128);
6498 Py_DECREF(unicode);
6499 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500}
6501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006503_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504{
6505 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 PyErr_BadArgument();
6507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006509 if (PyUnicode_READY(unicode) == -1)
6510 return NULL;
6511 /* Fast path: if it is an ASCII-only string, construct bytes object
6512 directly. Else defer to above function to raise the exception. */
6513 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6514 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6515 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517}
6518
6519PyObject *
6520PyUnicode_AsASCIIString(PyObject *unicode)
6521{
6522 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
Victor Stinner99b95382011-07-04 14:23:54 +02006525#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006526
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006527/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006528
/* When size_t is wider than int, the code page codecs below process their
   input in several int-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for platform headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6536
6537static char*
6538code_page_name(UINT code_page, PyObject **obj)
6539{
6540 *obj = NULL;
6541 if (code_page == CP_ACP)
6542 return "mbcs";
6543 if (code_page == CP_UTF7)
6544 return "CP_UTF7";
6545 if (code_page == CP_UTF8)
6546 return "CP_UTF8";
6547
6548 *obj = PyBytes_FromFormat("cp%u", code_page);
6549 if (*obj == NULL)
6550 return NULL;
6551 return PyBytes_AS_STRING(*obj);
6552}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553
Alexander Belopolsky40018472011-02-26 01:02:56 +00006554static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006555is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006556{
6557 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006558 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006559
Victor Stinner3a50e702011-10-18 21:21:00 +02006560 if (!IsDBCSLeadByteEx(code_page, *curr))
6561 return 0;
6562
6563 prev = CharPrevExA(code_page, s, curr, 0);
6564 if (prev == curr)
6565 return 1;
6566 /* FIXME: This code is limited to "true" double-byte encodings,
6567 as it assumes an incomplete character consists of a single
6568 byte. */
6569 if (curr - prev == 2)
6570 return 1;
6571 if (!IsDBCSLeadByteEx(code_page, *prev))
6572 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573 return 0;
6574}
6575
Victor Stinner3a50e702011-10-18 21:21:00 +02006576static DWORD
6577decode_code_page_flags(UINT code_page)
6578{
6579 if (code_page == CP_UTF7) {
6580 /* The CP_UTF7 decoder only supports flags=0 */
6581 return 0;
6582 }
6583 else
6584 return MB_ERR_INVALID_CHARS;
6585}
6586
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006588 * Decode a byte string from a Windows code page into unicode object in strict
6589 * mode.
6590 *
6591 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6592 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006593 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006594static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006595decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006596 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006597 const char *in,
6598 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006599{
Victor Stinner3a50e702011-10-18 21:21:00 +02006600 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006601 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006602 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603
6604 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006605 assert(insize > 0);
6606 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6607 if (outsize <= 0)
6608 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609
6610 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006612 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006613 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 if (*v == NULL)
6615 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006616 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617 }
6618 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006620 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006621 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006623 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006624 }
6625
6626 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006627 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6628 if (outsize <= 0)
6629 goto error;
6630 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006631
Victor Stinner3a50e702011-10-18 21:21:00 +02006632error:
6633 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6634 return -2;
6635 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006636 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006637}
6638
Victor Stinner3a50e702011-10-18 21:21:00 +02006639/*
6640 * Decode a byte string from a code page into unicode object with an error
6641 * handler.
6642 *
6643 * Returns consumed size if succeed, or raise a WindowsError or
6644 * UnicodeDecodeError exception and returns -1 on error.
6645 */
6646static int
6647decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006648 PyObject **v,
6649 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006650 const char *errors)
6651{
6652 const char *startin = in;
6653 const char *endin = in + size;
6654 const DWORD flags = decode_code_page_flags(code_page);
6655 /* Ideally, we should get reason from FormatMessage. This is the Windows
6656 2000 English version of the message. */
6657 const char *reason = "No mapping for the Unicode character exists "
6658 "in the target code page.";
6659 /* each step cannot decode more than 1 character, but a character can be
6660 represented as a surrogate pair */
6661 wchar_t buffer[2], *startout, *out;
6662 int insize, outsize;
6663 PyObject *errorHandler = NULL;
6664 PyObject *exc = NULL;
6665 PyObject *encoding_obj = NULL;
6666 char *encoding;
6667 DWORD err;
6668 int ret = -1;
6669
6670 assert(size > 0);
6671
6672 encoding = code_page_name(code_page, &encoding_obj);
6673 if (encoding == NULL)
6674 return -1;
6675
6676 if (errors == NULL || strcmp(errors, "strict") == 0) {
6677 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6678 UnicodeDecodeError. */
6679 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6680 if (exc != NULL) {
6681 PyCodec_StrictErrors(exc);
6682 Py_CLEAR(exc);
6683 }
6684 goto error;
6685 }
6686
6687 if (*v == NULL) {
6688 /* Create unicode object */
6689 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6690 PyErr_NoMemory();
6691 goto error;
6692 }
Victor Stinnerab595942011-12-17 04:59:06 +01006693 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006694 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006695 if (*v == NULL)
6696 goto error;
6697 startout = PyUnicode_AS_UNICODE(*v);
6698 }
6699 else {
6700 /* Extend unicode object */
6701 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6702 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6703 PyErr_NoMemory();
6704 goto error;
6705 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006706 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006707 goto error;
6708 startout = PyUnicode_AS_UNICODE(*v) + n;
6709 }
6710
6711 /* Decode the byte string character per character */
6712 out = startout;
6713 while (in < endin)
6714 {
6715 /* Decode a character */
6716 insize = 1;
6717 do
6718 {
6719 outsize = MultiByteToWideChar(code_page, flags,
6720 in, insize,
6721 buffer, Py_ARRAY_LENGTH(buffer));
6722 if (outsize > 0)
6723 break;
6724 err = GetLastError();
6725 if (err != ERROR_NO_UNICODE_TRANSLATION
6726 && err != ERROR_INSUFFICIENT_BUFFER)
6727 {
6728 PyErr_SetFromWindowsErr(0);
6729 goto error;
6730 }
6731 insize++;
6732 }
6733 /* 4=maximum length of a UTF-8 sequence */
6734 while (insize <= 4 && (in + insize) <= endin);
6735
6736 if (outsize <= 0) {
6737 Py_ssize_t startinpos, endinpos, outpos;
6738
6739 startinpos = in - startin;
6740 endinpos = startinpos + 1;
6741 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006742 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006743 errors, &errorHandler,
6744 encoding, reason,
6745 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006746 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006747 {
6748 goto error;
6749 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006750 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006751 }
6752 else {
6753 in += insize;
6754 memcpy(out, buffer, outsize * sizeof(wchar_t));
6755 out += outsize;
6756 }
6757 }
6758
6759 /* write a NUL character at the end */
6760 *out = 0;
6761
6762 /* Extend unicode object */
6763 outsize = out - startout;
6764 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006765 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006766 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006767 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006768
6769error:
6770 Py_XDECREF(encoding_obj);
6771 Py_XDECREF(errorHandler);
6772 Py_XDECREF(exc);
6773 return ret;
6774}
6775
Victor Stinner3a50e702011-10-18 21:21:00 +02006776static PyObject *
6777decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006778 const char *s, Py_ssize_t size,
6779 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780{
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 PyObject *v = NULL;
6782 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783
Victor Stinner3a50e702011-10-18 21:21:00 +02006784 if (code_page < 0) {
6785 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6786 return NULL;
6787 }
6788
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006789 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 do
6793 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 if (size > INT_MAX) {
6796 chunk_size = INT_MAX;
6797 final = 0;
6798 done = 0;
6799 }
6800 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006801#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006802 {
6803 chunk_size = (int)size;
6804 final = (consumed == NULL);
6805 done = 1;
6806 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006807
Victor Stinner76a31a62011-11-04 00:05:13 +01006808 /* Skip trailing lead-byte unless 'final' is set */
6809 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6810 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811
Victor Stinner76a31a62011-11-04 00:05:13 +01006812 if (chunk_size == 0 && done) {
6813 if (v != NULL)
6814 break;
6815 Py_INCREF(unicode_empty);
6816 return unicode_empty;
6817 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818
Victor Stinner76a31a62011-11-04 00:05:13 +01006819
6820 converted = decode_code_page_strict(code_page, &v,
6821 s, chunk_size);
6822 if (converted == -2)
6823 converted = decode_code_page_errors(code_page, &v,
6824 s, chunk_size,
6825 errors);
6826 assert(converted != 0);
6827
6828 if (converted < 0) {
6829 Py_XDECREF(v);
6830 return NULL;
6831 }
6832
6833 if (consumed)
6834 *consumed += converted;
6835
6836 s += converted;
6837 size -= converted;
6838 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006839
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006840 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841}
6842
Alexander Belopolsky40018472011-02-26 01:02:56 +00006843PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006844PyUnicode_DecodeCodePageStateful(int code_page,
6845 const char *s,
6846 Py_ssize_t size,
6847 const char *errors,
6848 Py_ssize_t *consumed)
6849{
6850 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6851}
6852
6853PyObject *
6854PyUnicode_DecodeMBCSStateful(const char *s,
6855 Py_ssize_t size,
6856 const char *errors,
6857 Py_ssize_t *consumed)
6858{
6859 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6860}
6861
6862PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863PyUnicode_DecodeMBCS(const char *s,
6864 Py_ssize_t size,
6865 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006866{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6868}
6869
Victor Stinner3a50e702011-10-18 21:21:00 +02006870static DWORD
6871encode_code_page_flags(UINT code_page, const char *errors)
6872{
6873 if (code_page == CP_UTF8) {
6874 if (winver.dwMajorVersion >= 6)
6875 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6876 and later */
6877 return WC_ERR_INVALID_CHARS;
6878 else
6879 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6880 return 0;
6881 }
6882 else if (code_page == CP_UTF7) {
6883 /* CP_UTF7 only supports flags=0 */
6884 return 0;
6885 }
6886 else {
6887 if (errors != NULL && strcmp(errors, "replace") == 0)
6888 return 0;
6889 else
6890 return WC_NO_BEST_FIT_CHARS;
6891 }
6892}
6893
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 * Encode a Unicode string to a Windows code page into a byte string in strict
6896 * mode.
6897 *
6898 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6899 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006901static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006902encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006903 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905{
Victor Stinner554f3f02010-06-16 23:33:54 +00006906 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006907 BOOL *pusedDefaultChar = &usedDefaultChar;
6908 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006909 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006910 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006911 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006912 const DWORD flags = encode_code_page_flags(code_page, NULL);
6913 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006914 /* Create a substring so that we can get the UTF-16 representation
6915 of just the slice under consideration. */
6916 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917
Martin v. Löwis3d325192011-11-04 18:23:06 +01006918 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006919
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006921 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006923 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006924
Victor Stinner2fc507f2011-11-04 20:06:39 +01006925 substring = PyUnicode_Substring(unicode, offset, offset+len);
6926 if (substring == NULL)
6927 return -1;
6928 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6929 if (p == NULL) {
6930 Py_DECREF(substring);
6931 return -1;
6932 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006933
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006934 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 outsize = WideCharToMultiByte(code_page, flags,
6936 p, size,
6937 NULL, 0,
6938 NULL, pusedDefaultChar);
6939 if (outsize <= 0)
6940 goto error;
6941 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006942 if (pusedDefaultChar && *pusedDefaultChar) {
6943 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006945 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006946
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006950 if (*outbytes == NULL) {
6951 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006953 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955 }
6956 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006958 const Py_ssize_t n = PyBytes_Size(*outbytes);
6959 if (outsize > PY_SSIZE_T_MAX - n) {
6960 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006961 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006964 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6965 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006966 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006967 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 }
6970
6971 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006972 outsize = WideCharToMultiByte(code_page, flags,
6973 p, size,
6974 out, outsize,
6975 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006976 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006977 if (outsize <= 0)
6978 goto error;
6979 if (pusedDefaultChar && *pusedDefaultChar)
6980 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006982
Victor Stinner3a50e702011-10-18 21:21:00 +02006983error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006984 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6986 return -2;
6987 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006988 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006989}
6990
Victor Stinner3a50e702011-10-18 21:21:00 +02006991/*
6992 * Encode a Unicode string to a Windows code page into a byte string using a
6993 * error handler.
6994 *
6995 * Returns consumed characters if succeed, or raise a WindowsError and returns
6996 * -1 on other error.
6997 */
6998static int
6999encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007000 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007001 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007002{
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007004 Py_ssize_t pos = unicode_offset;
7005 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007006 /* Ideally, we should get reason from FormatMessage. This is the Windows
7007 2000 English version of the message. */
7008 const char *reason = "invalid character";
7009 /* 4=maximum length of a UTF-8 sequence */
7010 char buffer[4];
7011 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7012 Py_ssize_t outsize;
7013 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 PyObject *errorHandler = NULL;
7015 PyObject *exc = NULL;
7016 PyObject *encoding_obj = NULL;
7017 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007018 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 PyObject *rep;
7020 int ret = -1;
7021
7022 assert(insize > 0);
7023
7024 encoding = code_page_name(code_page, &encoding_obj);
7025 if (encoding == NULL)
7026 return -1;
7027
7028 if (errors == NULL || strcmp(errors, "strict") == 0) {
7029 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7030 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007031 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 if (exc != NULL) {
7033 PyCodec_StrictErrors(exc);
7034 Py_DECREF(exc);
7035 }
7036 Py_XDECREF(encoding_obj);
7037 return -1;
7038 }
7039
7040 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7041 pusedDefaultChar = &usedDefaultChar;
7042 else
7043 pusedDefaultChar = NULL;
7044
7045 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7046 PyErr_NoMemory();
7047 goto error;
7048 }
7049 outsize = insize * Py_ARRAY_LENGTH(buffer);
7050
7051 if (*outbytes == NULL) {
7052 /* Create string object */
7053 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7054 if (*outbytes == NULL)
7055 goto error;
7056 out = PyBytes_AS_STRING(*outbytes);
7057 }
7058 else {
7059 /* Extend string object */
7060 Py_ssize_t n = PyBytes_Size(*outbytes);
7061 if (n > PY_SSIZE_T_MAX - outsize) {
7062 PyErr_NoMemory();
7063 goto error;
7064 }
7065 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7066 goto error;
7067 out = PyBytes_AS_STRING(*outbytes) + n;
7068 }
7069
7070 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007071 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007073 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7074 wchar_t chars[2];
7075 int charsize;
7076 if (ch < 0x10000) {
7077 chars[0] = (wchar_t)ch;
7078 charsize = 1;
7079 }
7080 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007081 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7082 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007083 charsize = 2;
7084 }
7085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007087 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 buffer, Py_ARRAY_LENGTH(buffer),
7089 NULL, pusedDefaultChar);
7090 if (outsize > 0) {
7091 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7092 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007093 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 memcpy(out, buffer, outsize);
7095 out += outsize;
7096 continue;
7097 }
7098 }
7099 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7100 PyErr_SetFromWindowsErr(0);
7101 goto error;
7102 }
7103
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 rep = unicode_encode_call_errorhandler(
7105 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007106 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007107 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 if (rep == NULL)
7109 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007110 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007111
7112 if (PyBytes_Check(rep)) {
7113 outsize = PyBytes_GET_SIZE(rep);
7114 if (outsize != 1) {
7115 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7116 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7117 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7118 Py_DECREF(rep);
7119 goto error;
7120 }
7121 out = PyBytes_AS_STRING(*outbytes) + offset;
7122 }
7123 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7124 out += outsize;
7125 }
7126 else {
7127 Py_ssize_t i;
7128 enum PyUnicode_Kind kind;
7129 void *data;
7130
Benjamin Petersonbac79492012-01-14 13:34:47 -05007131 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 Py_DECREF(rep);
7133 goto error;
7134 }
7135
7136 outsize = PyUnicode_GET_LENGTH(rep);
7137 if (outsize != 1) {
7138 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7139 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7140 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7141 Py_DECREF(rep);
7142 goto error;
7143 }
7144 out = PyBytes_AS_STRING(*outbytes) + offset;
7145 }
7146 kind = PyUnicode_KIND(rep);
7147 data = PyUnicode_DATA(rep);
7148 for (i=0; i < outsize; i++) {
7149 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7150 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007151 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007152 encoding, unicode,
7153 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 "unable to encode error handler result to ASCII");
7155 Py_DECREF(rep);
7156 goto error;
7157 }
7158 *out = (unsigned char)ch;
7159 out++;
7160 }
7161 }
7162 Py_DECREF(rep);
7163 }
7164 /* write a NUL byte */
7165 *out = 0;
7166 outsize = out - PyBytes_AS_STRING(*outbytes);
7167 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7168 if (_PyBytes_Resize(outbytes, outsize) < 0)
7169 goto error;
7170 ret = 0;
7171
7172error:
7173 Py_XDECREF(encoding_obj);
7174 Py_XDECREF(errorHandler);
7175 Py_XDECREF(exc);
7176 return ret;
7177}
7178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179static PyObject *
7180encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007181 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 const char *errors)
7183{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007184 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007186 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007187 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007188
Benjamin Petersonbac79492012-01-14 13:34:47 -05007189 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 return NULL;
7191 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007192
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 if (code_page < 0) {
7194 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7195 return NULL;
7196 }
7197
Martin v. Löwis3d325192011-11-04 18:23:06 +01007198 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 return PyBytes_FromStringAndSize(NULL, 0);
7200
Victor Stinner7581cef2011-11-03 22:32:33 +01007201 offset = 0;
7202 do
7203 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007206 chunks. */
7207 if (len > INT_MAX/2) {
7208 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 done = 0;
7210 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007211 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007213 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007214 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 done = 1;
7216 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217
Victor Stinner76a31a62011-11-04 00:05:13 +01007218 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 errors);
7221 if (ret == -2)
7222 ret = encode_code_page_errors(code_page, &outbytes,
7223 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007224 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007225 if (ret < 0) {
7226 Py_XDECREF(outbytes);
7227 return NULL;
7228 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229
Victor Stinner7581cef2011-11-03 22:32:33 +01007230 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007231 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007232 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 return outbytes;
7235}
7236
7237PyObject *
7238PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7239 Py_ssize_t size,
7240 const char *errors)
7241{
Victor Stinner7581cef2011-11-03 22:32:33 +01007242 PyObject *unicode, *res;
7243 unicode = PyUnicode_FromUnicode(p, size);
7244 if (unicode == NULL)
7245 return NULL;
7246 res = encode_code_page(CP_ACP, unicode, errors);
7247 Py_DECREF(unicode);
7248 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007249}
7250
7251PyObject *
7252PyUnicode_EncodeCodePage(int code_page,
7253 PyObject *unicode,
7254 const char *errors)
7255{
Victor Stinner7581cef2011-11-03 22:32:33 +01007256 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007257}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007258
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259PyObject *
7260PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007261{
7262 if (!PyUnicode_Check(unicode)) {
7263 PyErr_BadArgument();
7264 return NULL;
7265 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007266 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007267}
7268
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269#undef NEED_RETRY
7270
Victor Stinner99b95382011-07-04 14:23:54 +02007271#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007272
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273/* --- Character Mapping Codec -------------------------------------------- */
7274
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275PyObject *
7276PyUnicode_DecodeCharmap(const char *s,
7277 Py_ssize_t size,
7278 PyObject *mapping,
7279 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007282 Py_ssize_t startinpos;
7283 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007285 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 PyObject *errorHandler = NULL;
7287 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007288
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289 /* Default to Latin-1 */
7290 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007293 if (size == 0) {
7294 Py_INCREF(unicode_empty);
7295 return unicode_empty;
7296 }
7297 _PyUnicodeWriter_Init(&writer, 0);
7298 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007301 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007302 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007303 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007304 enum PyUnicode_Kind mapkind;
7305 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007306 Py_UCS4 x;
7307
Benjamin Petersonbac79492012-01-14 13:34:47 -05007308 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007309 return NULL;
7310
7311 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007312 mapdata = PyUnicode_DATA(mapping);
7313 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007315 unsigned char ch;
7316 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007317 enum PyUnicode_Kind outkind = writer.kind;
7318 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007319 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007320 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007321 while (s < e) {
7322 unsigned char ch = *s;
7323 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7324 if (x > maxchar)
7325 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007326 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7327 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328 ++s;
7329 }
7330 break;
7331 }
7332 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007333 while (s < e) {
7334 unsigned char ch = *s;
7335 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7336 if (x == 0xFFFE)
7337 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007338 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7339 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007340 ++s;
7341 }
7342 break;
7343 }
7344 }
7345 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007348 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007349 else
7350 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007351Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007352 if (x == 0xfffe)
7353 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 startinpos = s-starts;
7356 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007357 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 errors, &errorHandler,
7359 "charmap", "character maps to <undefined>",
7360 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007361 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 goto onError;
7363 }
7364 continue;
7365 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007366
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007367 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007368 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007369 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7370 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007372 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007373 }
7374 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 while (s < e) {
7376 unsigned char ch = *s;
7377 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007378
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7380 w = PyLong_FromLong((long)ch);
7381 if (w == NULL)
7382 goto onError;
7383 x = PyObject_GetItem(mapping, w);
7384 Py_DECREF(w);
7385 if (x == NULL) {
7386 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7387 /* No mapping found means: mapping is undefined. */
7388 PyErr_Clear();
7389 x = Py_None;
7390 Py_INCREF(x);
7391 } else
7392 goto onError;
7393 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007394
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 /* Apply mapping */
7396 if (PyLong_Check(x)) {
7397 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007398 if (value < 0 || value > MAX_UNICODE) {
7399 PyErr_Format(PyExc_TypeError,
7400 "character mapping must be in range(0x%lx)",
7401 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 Py_DECREF(x);
7403 goto onError;
7404 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007405
7406 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007407 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007408 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7409 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 }
7411 else if (x == Py_None) {
7412 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 startinpos = s-starts;
7414 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007415 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 errors, &errorHandler,
7417 "charmap", "character maps to <undefined>",
7418 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007419 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 Py_DECREF(x);
7421 goto onError;
7422 }
7423 Py_DECREF(x);
7424 continue;
7425 }
7426 else if (PyUnicode_Check(x)) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007427 writer.overallocate = 1;
7428 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007429 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 }
7431 else {
7432 /* wrong return value */
7433 PyErr_SetString(PyExc_TypeError,
7434 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 Py_DECREF(x);
7436 goto onError;
7437 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 Py_DECREF(x);
7439 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 Py_XDECREF(errorHandler);
7443 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007444 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007445
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007447 Py_XDECREF(errorHandler);
7448 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007449 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 return NULL;
7451}
7452
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453/* Charmap encoding: the lookup table */
7454
Alexander Belopolsky40018472011-02-26 01:02:56 +00007455struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 PyObject_HEAD
7457 unsigned char level1[32];
7458 int count2, count3;
7459 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007460};
7461
7462static PyObject*
7463encoding_map_size(PyObject *obj, PyObject* args)
7464{
7465 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468}
7469
7470static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007471 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 PyDoc_STR("Return the size (in bytes) of this object") },
7473 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007474};
7475
7476static void
7477encoding_map_dealloc(PyObject* o)
7478{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007480}
7481
7482static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 "EncodingMap", /*tp_name*/
7485 sizeof(struct encoding_map), /*tp_basicsize*/
7486 0, /*tp_itemsize*/
7487 /* methods */
7488 encoding_map_dealloc, /*tp_dealloc*/
7489 0, /*tp_print*/
7490 0, /*tp_getattr*/
7491 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007492 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 0, /*tp_repr*/
7494 0, /*tp_as_number*/
7495 0, /*tp_as_sequence*/
7496 0, /*tp_as_mapping*/
7497 0, /*tp_hash*/
7498 0, /*tp_call*/
7499 0, /*tp_str*/
7500 0, /*tp_getattro*/
7501 0, /*tp_setattro*/
7502 0, /*tp_as_buffer*/
7503 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7504 0, /*tp_doc*/
7505 0, /*tp_traverse*/
7506 0, /*tp_clear*/
7507 0, /*tp_richcompare*/
7508 0, /*tp_weaklistoffset*/
7509 0, /*tp_iter*/
7510 0, /*tp_iternext*/
7511 encoding_map_methods, /*tp_methods*/
7512 0, /*tp_members*/
7513 0, /*tp_getset*/
7514 0, /*tp_base*/
7515 0, /*tp_dict*/
7516 0, /*tp_descr_get*/
7517 0, /*tp_descr_set*/
7518 0, /*tp_dictoffset*/
7519 0, /*tp_init*/
7520 0, /*tp_alloc*/
7521 0, /*tp_new*/
7522 0, /*tp_free*/
7523 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007524};
7525
7526PyObject*
7527PyUnicode_BuildEncodingMap(PyObject* string)
7528{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007529 PyObject *result;
7530 struct encoding_map *mresult;
7531 int i;
7532 int need_dict = 0;
7533 unsigned char level1[32];
7534 unsigned char level2[512];
7535 unsigned char *mlevel1, *mlevel2, *mlevel3;
7536 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 int kind;
7538 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007539 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007540 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007541
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007542 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007543 PyErr_BadArgument();
7544 return NULL;
7545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007546 kind = PyUnicode_KIND(string);
7547 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007548 length = PyUnicode_GET_LENGTH(string);
7549 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007550 memset(level1, 0xFF, sizeof level1);
7551 memset(level2, 0xFF, sizeof level2);
7552
7553 /* If there isn't a one-to-one mapping of NULL to \0,
7554 or if there are non-BMP characters, we need to use
7555 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007557 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007558 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007559 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007560 ch = PyUnicode_READ(kind, data, i);
7561 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007562 need_dict = 1;
7563 break;
7564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007566 /* unmapped character */
7567 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007568 l1 = ch >> 11;
7569 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 if (level1[l1] == 0xFF)
7571 level1[l1] = count2++;
7572 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007573 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007574 }
7575
7576 if (count2 >= 0xFF || count3 >= 0xFF)
7577 need_dict = 1;
7578
7579 if (need_dict) {
7580 PyObject *result = PyDict_New();
7581 PyObject *key, *value;
7582 if (!result)
7583 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007584 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007585 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007586 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 if (!key || !value)
7588 goto failed1;
7589 if (PyDict_SetItem(result, key, value) == -1)
7590 goto failed1;
7591 Py_DECREF(key);
7592 Py_DECREF(value);
7593 }
7594 return result;
7595 failed1:
7596 Py_XDECREF(key);
7597 Py_XDECREF(value);
7598 Py_DECREF(result);
7599 return NULL;
7600 }
7601
7602 /* Create a three-level trie */
7603 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7604 16*count2 + 128*count3 - 1);
7605 if (!result)
7606 return PyErr_NoMemory();
7607 PyObject_Init(result, &EncodingMapType);
7608 mresult = (struct encoding_map*)result;
7609 mresult->count2 = count2;
7610 mresult->count3 = count3;
7611 mlevel1 = mresult->level1;
7612 mlevel2 = mresult->level23;
7613 mlevel3 = mresult->level23 + 16*count2;
7614 memcpy(mlevel1, level1, 32);
7615 memset(mlevel2, 0xFF, 16*count2);
7616 memset(mlevel3, 0, 128*count3);
7617 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007618 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007620 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7621 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007622 /* unmapped character */
7623 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007624 o1 = ch>>11;
7625 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007626 i2 = 16*mlevel1[o1] + o2;
7627 if (mlevel2[i2] == 0xFF)
7628 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007629 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007630 i3 = 128*mlevel2[i2] + o3;
7631 mlevel3[i3] = i;
7632 }
7633 return result;
7634}
7635
7636static int
Victor Stinner22168992011-11-20 17:09:18 +01007637encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007638{
7639 struct encoding_map *map = (struct encoding_map*)mapping;
7640 int l1 = c>>11;
7641 int l2 = (c>>7) & 0xF;
7642 int l3 = c & 0x7F;
7643 int i;
7644
Victor Stinner22168992011-11-20 17:09:18 +01007645 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007647 if (c == 0)
7648 return 0;
7649 /* level 1*/
7650 i = map->level1[l1];
7651 if (i == 0xFF) {
7652 return -1;
7653 }
7654 /* level 2*/
7655 i = map->level23[16*i+l2];
7656 if (i == 0xFF) {
7657 return -1;
7658 }
7659 /* level 3 */
7660 i = map->level23[16*map->count2 + 128*i + l3];
7661 if (i == 0) {
7662 return -1;
7663 }
7664 return i;
7665}
7666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667/* Lookup the character ch in the mapping. If the character
7668 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007669 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007670static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007671charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672{
Christian Heimes217cfd12007-12-02 14:31:20 +00007673 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 PyObject *x;
7675
7676 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 x = PyObject_GetItem(mapping, w);
7679 Py_DECREF(w);
7680 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7682 /* No mapping found means: mapping is undefined. */
7683 PyErr_Clear();
7684 x = Py_None;
7685 Py_INCREF(x);
7686 return x;
7687 } else
7688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007690 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007692 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 long value = PyLong_AS_LONG(x);
7694 if (value < 0 || value > 255) {
7695 PyErr_SetString(PyExc_TypeError,
7696 "character mapping must be in range(256)");
7697 Py_DECREF(x);
7698 return NULL;
7699 }
7700 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007702 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 /* wrong return value */
7706 PyErr_Format(PyExc_TypeError,
7707 "character mapping must return integer, bytes or None, not %.400s",
7708 x->ob_type->tp_name);
7709 Py_DECREF(x);
7710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
7712}
7713
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007715charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007717 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7718 /* exponentially overallocate to minimize reallocations */
7719 if (requiredsize < 2*outsize)
7720 requiredsize = 2*outsize;
7721 if (_PyBytes_Resize(outobj, requiredsize))
7722 return -1;
7723 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007724}
7725
/* Outcome of encoding one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failure) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007729/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007730 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731 space is available. Return a new reference to the object that
7732 was put in the output buffer, or Py_None, if the mapping was undefined
7733 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007734 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007736charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007737 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007739 PyObject *rep;
7740 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007741 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742
Christian Heimes90aa7642007-12-19 02:45:37 +00007743 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746 if (res == -1)
7747 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 if (outsize<requiredsize)
7749 if (charmapencode_resize(outobj, outpos, requiredsize))
7750 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007751 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 outstart[(*outpos)++] = (char)res;
7753 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754 }
7755
7756 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 Py_DECREF(rep);
7761 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 if (PyLong_Check(rep)) {
7764 Py_ssize_t requiredsize = *outpos+1;
7765 if (outsize<requiredsize)
7766 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7767 Py_DECREF(rep);
7768 return enc_EXCEPTION;
7769 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007770 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 else {
7774 const char *repchars = PyBytes_AS_STRING(rep);
7775 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7776 Py_ssize_t requiredsize = *outpos+repsize;
7777 if (outsize<requiredsize)
7778 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7779 Py_DECREF(rep);
7780 return enc_EXCEPTION;
7781 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007782 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 memcpy(outstart + *outpos, repchars, repsize);
7784 *outpos += repsize;
7785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787 Py_DECREF(rep);
7788 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789}
7790
7791/* handle an error in PyUnicode_EncodeCharmap
7792 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007793static int
7794charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007795 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007797 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007798 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799{
7800 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007801 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007803 enum PyUnicode_Kind kind;
7804 void *data;
7805 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t collstartpos = *inpos;
7808 Py_ssize_t collendpos = *inpos+1;
7809 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810 char *encoding = "charmap";
7811 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007813 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007814 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815
Benjamin Petersonbac79492012-01-14 13:34:47 -05007816 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007817 return -1;
7818 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 /* find all unencodable characters */
7820 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007822 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007823 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007824 val = encoding_map_lookup(ch, mapping);
7825 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 break;
7827 ++collendpos;
7828 continue;
7829 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007831 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7832 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 if (rep==NULL)
7834 return -1;
7835 else if (rep!=Py_None) {
7836 Py_DECREF(rep);
7837 break;
7838 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 }
7842 /* cache callback name lookup
7843 * (if not done yet, i.e. it's the first error) */
7844 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 if ((errors==NULL) || (!strcmp(errors, "strict")))
7846 *known_errorHandler = 1;
7847 else if (!strcmp(errors, "replace"))
7848 *known_errorHandler = 2;
7849 else if (!strcmp(errors, "ignore"))
7850 *known_errorHandler = 3;
7851 else if (!strcmp(errors, "xmlcharrefreplace"))
7852 *known_errorHandler = 4;
7853 else
7854 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007855 }
7856 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007858 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 return -1;
7860 case 2: /* replace */
7861 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 x = charmapencode_output('?', mapping, res, respos);
7863 if (x==enc_EXCEPTION) {
7864 return -1;
7865 }
7866 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007867 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 return -1;
7869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007870 }
7871 /* fall through */
7872 case 3: /* ignore */
7873 *inpos = collendpos;
7874 break;
7875 case 4: /* xmlcharrefreplace */
7876 /* generate replacement (temporarily (mis)uses p) */
7877 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 char buffer[2+29+1+1];
7879 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007880 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 for (cp = buffer; *cp; ++cp) {
7882 x = charmapencode_output(*cp, mapping, res, respos);
7883 if (x==enc_EXCEPTION)
7884 return -1;
7885 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007886 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 return -1;
7888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 }
7890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 *inpos = collendpos;
7892 break;
7893 default:
7894 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007895 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007899 if (PyBytes_Check(repunicode)) {
7900 /* Directly copy bytes result to output. */
7901 Py_ssize_t outsize = PyBytes_Size(*res);
7902 Py_ssize_t requiredsize;
7903 repsize = PyBytes_Size(repunicode);
7904 requiredsize = *respos + repsize;
7905 if (requiredsize > outsize)
7906 /* Make room for all additional bytes. */
7907 if (charmapencode_resize(res, respos, requiredsize)) {
7908 Py_DECREF(repunicode);
7909 return -1;
7910 }
7911 memcpy(PyBytes_AsString(*res) + *respos,
7912 PyBytes_AsString(repunicode), repsize);
7913 *respos += repsize;
7914 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007915 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007916 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007919 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007920 Py_DECREF(repunicode);
7921 return -1;
7922 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007923 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007924 data = PyUnicode_DATA(repunicode);
7925 kind = PyUnicode_KIND(repunicode);
7926 for (index = 0; index < repsize; index++) {
7927 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7928 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007930 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return -1;
7932 }
7933 else if (x==enc_FAILED) {
7934 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007935 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return -1;
7937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 }
7939 *inpos = newpos;
7940 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 }
7942 return 0;
7943}
7944
Alexander Belopolsky40018472011-02-26 01:02:56 +00007945PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007946_PyUnicode_EncodeCharmap(PyObject *unicode,
7947 PyObject *mapping,
7948 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950 /* output object */
7951 PyObject *res = NULL;
7952 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 PyObject *errorHandler = NULL;
7958 PyObject *exc = NULL;
7959 /* the following variable is used for caching string comparisons
7960 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7961 * 3=ignore, 4=xmlcharrefreplace */
7962 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963
Benjamin Petersonbac79492012-01-14 13:34:47 -05007964 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007965 return NULL;
7966 size = PyUnicode_GET_LENGTH(unicode);
7967
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 /* Default to Latin-1 */
7969 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 /* allocate enough for a simple encoding without
7973 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007974 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975 if (res == NULL)
7976 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007977 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007981 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 if (x==enc_EXCEPTION) /* error */
7985 goto onError;
7986 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007987 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 &exc,
7989 &known_errorHandler, &errorHandler, errors,
7990 &res, &respos)) {
7991 goto onError;
7992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 else
7995 /* done with this character => adjust input position */
7996 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008000 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008001 if (_PyBytes_Resize(&res, respos) < 0)
8002 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008003
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008004 Py_XDECREF(exc);
8005 Py_XDECREF(errorHandler);
8006 return res;
8007
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009 Py_XDECREF(res);
8010 Py_XDECREF(exc);
8011 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 return NULL;
8013}
8014
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008015/* Deprecated */
8016PyObject *
8017PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8018 Py_ssize_t size,
8019 PyObject *mapping,
8020 const char *errors)
8021{
8022 PyObject *result;
8023 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8024 if (unicode == NULL)
8025 return NULL;
8026 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8027 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008028 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008029}
8030
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031PyObject *
8032PyUnicode_AsCharmapString(PyObject *unicode,
8033 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
8035 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 PyErr_BadArgument();
8037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008039 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043static void
8044make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008045 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046 Py_ssize_t startpos, Py_ssize_t endpos,
8047 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 *exceptionObject = _PyUnicodeTranslateError_Create(
8051 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
8053 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8055 goto onError;
8056 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8057 goto onError;
8058 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8059 goto onError;
8060 return;
8061 onError:
8062 Py_DECREF(*exceptionObject);
8063 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
8065}
8066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067/* error handling callback helper:
8068 build arguments, call the callback and check the arguments,
8069 put the result into newpos and return the replacement string, which
8070 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008071static PyObject *
8072unicode_translate_call_errorhandler(const char *errors,
8073 PyObject **errorHandler,
8074 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076 Py_ssize_t startpos, Py_ssize_t endpos,
8077 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008079 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008081 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 PyObject *restuple;
8083 PyObject *resunicode;
8084
8085 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 }
8090
8091 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095
8096 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008101 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_DECREF(restuple);
8103 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 &resunicode, &i_newpos)) {
8107 Py_DECREF(restuple);
8108 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 else
8113 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8116 Py_DECREF(restuple);
8117 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 Py_INCREF(resunicode);
8120 Py_DECREF(restuple);
8121 return resunicode;
8122}
8123
8124/* Lookup the character ch in the mapping and put the result in result,
8125 which must be decrefed by the caller.
8126 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008127static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129{
Christian Heimes217cfd12007-12-02 14:31:20 +00008130 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 PyObject *x;
8132
8133 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 x = PyObject_GetItem(mapping, w);
8136 Py_DECREF(w);
8137 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8139 /* No mapping found means: use 1:1 mapping. */
8140 PyErr_Clear();
8141 *result = NULL;
8142 return 0;
8143 } else
8144 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 }
8146 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 *result = x;
8148 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008150 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 long value = PyLong_AS_LONG(x);
8152 long max = PyUnicode_GetMax();
8153 if (value < 0 || value > max) {
8154 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008155 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 Py_DECREF(x);
8157 return -1;
8158 }
8159 *result = x;
8160 return 0;
8161 }
8162 else if (PyUnicode_Check(x)) {
8163 *result = x;
8164 return 0;
8165 }
8166 else {
8167 /* wrong return value */
8168 PyErr_SetString(PyExc_TypeError,
8169 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 Py_DECREF(x);
8171 return -1;
8172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173}
8174/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 if not reallocate and adjust various state variables.
8176 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008177static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008182 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008183 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 /* exponentially overallocate to minimize reallocations */
8185 if (requiredsize < 2 * oldsize)
8186 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008187 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8188 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008190 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 }
8193 return 0;
8194}
8195/* lookup the character, put the result in the output string and adjust
8196 various state variables. Return a new reference to the object that
8197 was put in the output buffer in *result, or Py_None, if the mapping was
8198 undefined (in which case no character was written).
8199 The called must decref result.
8200 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008201static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8203 PyObject *mapping, Py_UCS4 **output,
8204 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8208 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 }
8214 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008216 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 }
8220 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 Py_ssize_t repsize;
8222 if (PyUnicode_READY(*res) == -1)
8223 return -1;
8224 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 if (repsize==1) {
8226 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 }
8229 else if (repsize!=0) {
8230 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008231 Py_ssize_t requiredsize = *opos +
8232 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 Py_ssize_t i;
8235 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 for(i = 0; i < repsize; i++)
8238 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 return 0;
8244}
8245
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247_PyUnicode_TranslateCharmap(PyObject *input,
8248 PyObject *mapping,
8249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 /* input object */
8252 char *idata;
8253 Py_ssize_t size, i;
8254 int kind;
8255 /* output buffer */
8256 Py_UCS4 *output = NULL;
8257 Py_ssize_t osize;
8258 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 char *reason = "character maps to <undefined>";
8262 PyObject *errorHandler = NULL;
8263 PyObject *exc = NULL;
8264 /* the following variable is used for caching string comparisons
8265 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8266 * 3=ignore, 4=xmlcharrefreplace */
8267 int known_errorHandler = -1;
8268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 PyErr_BadArgument();
8271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (PyUnicode_READY(input) == -1)
8275 return NULL;
8276 idata = (char*)PyUnicode_DATA(input);
8277 kind = PyUnicode_KIND(input);
8278 size = PyUnicode_GET_LENGTH(input);
8279 i = 0;
8280
8281 if (size == 0) {
8282 Py_INCREF(input);
8283 return input;
8284 }
8285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 /* allocate enough for a simple 1:1 translation without
8287 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 osize = size;
8289 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8290 opos = 0;
8291 if (output == NULL) {
8292 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 /* try to encode it */
8298 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 if (charmaptranslate_output(input, i, mapping,
8300 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 Py_XDECREF(x);
8302 goto onError;
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 else { /* untranslatable character */
8308 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8309 Py_ssize_t repsize;
8310 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 Py_ssize_t collstart = i;
8314 Py_ssize_t collend = i+1;
8315 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 while (collend < size) {
8319 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 goto onError;
8321 Py_XDECREF(x);
8322 if (x!=Py_None)
8323 break;
8324 ++collend;
8325 }
8326 /* cache callback name lookup
8327 * (if not done yet, i.e. it's the first error) */
8328 if (known_errorHandler==-1) {
8329 if ((errors==NULL) || (!strcmp(errors, "strict")))
8330 known_errorHandler = 1;
8331 else if (!strcmp(errors, "replace"))
8332 known_errorHandler = 2;
8333 else if (!strcmp(errors, "ignore"))
8334 known_errorHandler = 3;
8335 else if (!strcmp(errors, "xmlcharrefreplace"))
8336 known_errorHandler = 4;
8337 else
8338 known_errorHandler = 0;
8339 }
8340 switch (known_errorHandler) {
8341 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008342 make_translate_exception(&exc,
8343 input, collstart, collend, reason);
8344 if (exc != NULL)
8345 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 case 2: /* replace */
8348 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 for (coll = collstart; coll<collend; coll++)
8350 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 /* fall through */
8352 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 break;
8355 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 /* generate replacement (temporarily (mis)uses i) */
8357 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 char buffer[2+29+1+1];
8359 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8361 if (charmaptranslate_makespace(&output, &osize,
8362 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 goto onError;
8364 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 break;
8369 default:
8370 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 reason, input, &exc,
8372 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008373 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008375 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008376 Py_DECREF(repunicode);
8377 goto onError;
8378 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 repsize = PyUnicode_GET_LENGTH(repunicode);
8381 if (charmaptranslate_makespace(&output, &osize,
8382 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 Py_DECREF(repunicode);
8384 goto onError;
8385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 for (uni2 = 0; repsize-->0; ++uni2)
8387 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8388 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 }
8392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8394 if (!res)
8395 goto onError;
8396 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 Py_XDECREF(exc);
8398 Py_XDECREF(errorHandler);
8399 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 Py_XDECREF(exc);
8404 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 return NULL;
8406}
8407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408/* Deprecated. Use PyUnicode_Translate instead. */
8409PyObject *
8410PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8411 Py_ssize_t size,
8412 PyObject *mapping,
8413 const char *errors)
8414{
Christian Heimes5f520f42012-09-11 14:03:25 +02008415 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8417 if (!unicode)
8418 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008419 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8420 Py_DECREF(unicode);
8421 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422}
8423
Alexander Belopolsky40018472011-02-26 01:02:56 +00008424PyObject *
8425PyUnicode_Translate(PyObject *str,
8426 PyObject *mapping,
8427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428{
8429 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008430
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 str = PyUnicode_FromObject(str);
8432 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008433 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 Py_DECREF(str);
8436 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437}
Tim Petersced69f82003-09-16 20:30:58 +00008438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008440fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441{
8442 /* No need to call PyUnicode_READY(self) because this function is only
8443 called as a callback from fixup() which does it already. */
8444 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8445 const int kind = PyUnicode_KIND(self);
8446 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008447 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008448 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 Py_ssize_t i;
8450
8451 for (i = 0; i < len; ++i) {
8452 ch = PyUnicode_READ(kind, data, i);
8453 fixed = 0;
8454 if (ch > 127) {
8455 if (Py_UNICODE_ISSPACE(ch))
8456 fixed = ' ';
8457 else {
8458 const int decimal = Py_UNICODE_TODECIMAL(ch);
8459 if (decimal >= 0)
8460 fixed = '0' + decimal;
8461 }
8462 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008463 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008464 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 PyUnicode_WRITE(kind, data, i, fixed);
8466 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008467 else
8468 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 }
8471
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008472 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473}
8474
8475PyObject *
8476_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8477{
8478 if (!PyUnicode_Check(unicode)) {
8479 PyErr_BadInternalCall();
8480 return NULL;
8481 }
8482 if (PyUnicode_READY(unicode) == -1)
8483 return NULL;
8484 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8485 /* If the string is already ASCII, just return the same string */
8486 Py_INCREF(unicode);
8487 return unicode;
8488 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008489 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490}
8491
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008492PyObject *
8493PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8494 Py_ssize_t length)
8495{
Victor Stinnerf0124502011-11-21 23:12:56 +01008496 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008497 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008498 Py_UCS4 maxchar;
8499 enum PyUnicode_Kind kind;
8500 void *data;
8501
Victor Stinner99d7ad02012-02-22 13:37:39 +01008502 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008503 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008504 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008505 if (ch > 127) {
8506 int decimal = Py_UNICODE_TODECIMAL(ch);
8507 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008508 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008509 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008510 }
8511 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008512
8513 /* Copy to a new string */
8514 decimal = PyUnicode_New(length, maxchar);
8515 if (decimal == NULL)
8516 return decimal;
8517 kind = PyUnicode_KIND(decimal);
8518 data = PyUnicode_DATA(decimal);
8519 /* Iterate over code points */
8520 for (i = 0; i < length; i++) {
8521 Py_UNICODE ch = s[i];
8522 if (ch > 127) {
8523 int decimal = Py_UNICODE_TODECIMAL(ch);
8524 if (decimal >= 0)
8525 ch = '0' + decimal;
8526 }
8527 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008529 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008530}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008531/* --- Decimal Encoder ---------------------------------------------------- */
8532
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533int
8534PyUnicode_EncodeDecimal(Py_UNICODE *s,
8535 Py_ssize_t length,
8536 char *output,
8537 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008538{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008540 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008541 enum PyUnicode_Kind kind;
8542 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008543
8544 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyErr_BadArgument();
8546 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008547 }
8548
Victor Stinner42bf7752011-11-21 22:52:58 +01008549 unicode = PyUnicode_FromUnicode(s, length);
8550 if (unicode == NULL)
8551 return -1;
8552
Benjamin Petersonbac79492012-01-14 13:34:47 -05008553 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008554 Py_DECREF(unicode);
8555 return -1;
8556 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008557 kind = PyUnicode_KIND(unicode);
8558 data = PyUnicode_DATA(unicode);
8559
Victor Stinnerb84d7232011-11-22 01:50:07 +01008560 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008561 PyObject *exc;
8562 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008564 Py_ssize_t startpos;
8565
8566 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008567
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008569 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008570 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 decimal = Py_UNICODE_TODECIMAL(ch);
8574 if (decimal >= 0) {
8575 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008576 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 continue;
8578 }
8579 if (0 < ch && ch < 256) {
8580 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008581 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 continue;
8583 }
Victor Stinner6345be92011-11-25 20:09:01 +01008584
Victor Stinner42bf7752011-11-21 22:52:58 +01008585 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008586 exc = NULL;
8587 raise_encode_exception(&exc, "decimal", unicode,
8588 startpos, startpos+1,
8589 "invalid decimal Unicode string");
8590 Py_XDECREF(exc);
8591 Py_DECREF(unicode);
8592 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008593 }
8594 /* 0-terminate the output string */
8595 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008596 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008597 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008598}
8599
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600/* --- Helpers ------------------------------------------------------------ */
8601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008603any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 Py_ssize_t start,
8605 Py_ssize_t end)
8606{
8607 int kind1, kind2, kind;
8608 void *buf1, *buf2;
8609 Py_ssize_t len1, len2, result;
8610
8611 kind1 = PyUnicode_KIND(s1);
8612 kind2 = PyUnicode_KIND(s2);
8613 kind = kind1 > kind2 ? kind1 : kind2;
8614 buf1 = PyUnicode_DATA(s1);
8615 buf2 = PyUnicode_DATA(s2);
8616 if (kind1 != kind)
8617 buf1 = _PyUnicode_AsKind(s1, kind);
8618 if (!buf1)
8619 return -2;
8620 if (kind2 != kind)
8621 buf2 = _PyUnicode_AsKind(s2, kind);
8622 if (!buf2) {
8623 if (kind1 != kind) PyMem_Free(buf1);
8624 return -2;
8625 }
8626 len1 = PyUnicode_GET_LENGTH(s1);
8627 len2 = PyUnicode_GET_LENGTH(s2);
8628
Victor Stinner794d5672011-10-10 03:21:36 +02008629 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008630 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008631 case PyUnicode_1BYTE_KIND:
8632 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8633 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8634 else
8635 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8636 break;
8637 case PyUnicode_2BYTE_KIND:
8638 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8639 break;
8640 case PyUnicode_4BYTE_KIND:
8641 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8642 break;
8643 default:
8644 assert(0); result = -2;
8645 }
8646 }
8647 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008648 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008649 case PyUnicode_1BYTE_KIND:
8650 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8651 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8652 else
8653 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654 break;
8655 case PyUnicode_2BYTE_KIND:
8656 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8657 break;
8658 case PyUnicode_4BYTE_KIND:
8659 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8660 break;
8661 default:
8662 assert(0); result = -2;
8663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 }
8665
8666 if (kind1 != kind)
8667 PyMem_Free(buf1);
8668 if (kind2 != kind)
8669 PyMem_Free(buf2);
8670
8671 return result;
8672}
8673
8674Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008675_PyUnicode_InsertThousandsGrouping(
8676 PyObject *unicode, Py_ssize_t index,
8677 Py_ssize_t n_buffer,
8678 void *digits, Py_ssize_t n_digits,
8679 Py_ssize_t min_width,
8680 const char *grouping, PyObject *thousands_sep,
8681 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682{
Victor Stinner41a863c2012-02-24 00:37:51 +01008683 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008684 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008685 Py_ssize_t thousands_sep_len;
8686 Py_ssize_t len;
8687
8688 if (unicode != NULL) {
8689 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008690 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008691 }
8692 else {
8693 kind = PyUnicode_1BYTE_KIND;
8694 data = NULL;
8695 }
8696 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8697 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8698 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8699 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008700 if (thousands_sep_kind < kind) {
8701 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8702 if (!thousands_sep_data)
8703 return -1;
8704 }
8705 else {
8706 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8707 if (!data)
8708 return -1;
8709 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008710 }
8711
Benjamin Petersonead6b532011-12-20 17:23:42 -06008712 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008714 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008715 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008716 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008717 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008718 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008719 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008720 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008721 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008722 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008723 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008724 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008726 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008727 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008728 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008729 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008730 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008732 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008733 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008734 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008735 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008736 break;
8737 default:
8738 assert(0);
8739 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008741 if (unicode != NULL && thousands_sep_kind != kind) {
8742 if (thousands_sep_kind < kind)
8743 PyMem_Free(thousands_sep_data);
8744 else
8745 PyMem_Free(data);
8746 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 if (unicode == NULL) {
8748 *maxchar = 127;
8749 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008750 *maxchar = MAX_MAXCHAR(*maxchar,
8751 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008752 }
8753 }
8754 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755}
8756
8757
/* Helper macro to fix up start/end slice values in place:
     - end greater than len is clamped to len;
     - negative end / start are taken relative to len and clamped to 0.
   start and end must be modifiable lvalues.  Every argument may be
   evaluated more than once, so do not pass expressions with side effects.
   The body is wrapped in do { ... } while (0) so that the expansion is a
   single statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008772
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773Py_ssize_t
8774PyUnicode_Count(PyObject *str,
8775 PyObject *substr,
8776 Py_ssize_t start,
8777 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008779 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008780 PyObject* str_obj;
8781 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 int kind1, kind2, kind;
8783 void *buf1 = NULL, *buf2 = NULL;
8784 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008785
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008786 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008787 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008789 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008790 if (!sub_obj) {
8791 Py_DECREF(str_obj);
8792 return -1;
8793 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008794 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008795 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 Py_DECREF(str_obj);
8797 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
Tim Petersced69f82003-09-16 20:30:58 +00008799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 kind1 = PyUnicode_KIND(str_obj);
8801 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008802 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008805 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008806 if (kind2 > kind) {
8807 Py_DECREF(sub_obj);
8808 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008809 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008810 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008811 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 if (!buf2)
8814 goto onError;
8815 len1 = PyUnicode_GET_LENGTH(str_obj);
8816 len2 = PyUnicode_GET_LENGTH(sub_obj);
8817
8818 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008819 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008821 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8822 result = asciilib_count(
8823 ((Py_UCS1*)buf1) + start, end - start,
8824 buf2, len2, PY_SSIZE_T_MAX
8825 );
8826 else
8827 result = ucs1lib_count(
8828 ((Py_UCS1*)buf1) + start, end - start,
8829 buf2, len2, PY_SSIZE_T_MAX
8830 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 break;
8832 case PyUnicode_2BYTE_KIND:
8833 result = ucs2lib_count(
8834 ((Py_UCS2*)buf1) + start, end - start,
8835 buf2, len2, PY_SSIZE_T_MAX
8836 );
8837 break;
8838 case PyUnicode_4BYTE_KIND:
8839 result = ucs4lib_count(
8840 ((Py_UCS4*)buf1) + start, end - start,
8841 buf2, len2, PY_SSIZE_T_MAX
8842 );
8843 break;
8844 default:
8845 assert(0); result = 0;
8846 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008847
8848 Py_DECREF(sub_obj);
8849 Py_DECREF(str_obj);
8850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (kind2 != kind)
8852 PyMem_Free(buf2);
8853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 onError:
8856 Py_DECREF(sub_obj);
8857 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 if (kind2 != kind && buf2)
8859 PyMem_Free(buf2);
8860 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861}
8862
Alexander Belopolsky40018472011-02-26 01:02:56 +00008863Py_ssize_t
8864PyUnicode_Find(PyObject *str,
8865 PyObject *sub,
8866 Py_ssize_t start,
8867 Py_ssize_t end,
8868 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008870 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008871
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008873 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008875 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008876 if (!sub) {
8877 Py_DECREF(str);
8878 return -2;
8879 }
8880 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8881 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 Py_DECREF(str);
8883 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 }
Tim Petersced69f82003-09-16 20:30:58 +00008885
Victor Stinner794d5672011-10-10 03:21:36 +02008886 result = any_find_slice(direction,
8887 str, sub, start, end
8888 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008889
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008891 Py_DECREF(sub);
8892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 return result;
8894}
8895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896Py_ssize_t
8897PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8898 Py_ssize_t start, Py_ssize_t end,
8899 int direction)
8900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008902 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (PyUnicode_READY(str) == -1)
8904 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008905 if (start < 0 || end < 0) {
8906 PyErr_SetString(PyExc_IndexError, "string index out of range");
8907 return -2;
8908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 if (end > PyUnicode_GET_LENGTH(str))
8910 end = PyUnicode_GET_LENGTH(str);
8911 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008912 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8913 kind, end-start, ch, direction);
8914 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008916 else
8917 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918}
8919
Alexander Belopolsky40018472011-02-26 01:02:56 +00008920static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008921tailmatch(PyObject *self,
8922 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008923 Py_ssize_t start,
8924 Py_ssize_t end,
8925 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 int kind_self;
8928 int kind_sub;
8929 void *data_self;
8930 void *data_sub;
8931 Py_ssize_t offset;
8932 Py_ssize_t i;
8933 Py_ssize_t end_sub;
8934
8935 if (PyUnicode_READY(self) == -1 ||
8936 PyUnicode_READY(substring) == -1)
8937 return 0;
8938
8939 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 return 1;
8941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8943 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 kind_self = PyUnicode_KIND(self);
8948 data_self = PyUnicode_DATA(self);
8949 kind_sub = PyUnicode_KIND(substring);
8950 data_sub = PyUnicode_DATA(substring);
8951 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8952
8953 if (direction > 0)
8954 offset = end;
8955 else
8956 offset = start;
8957
8958 if (PyUnicode_READ(kind_self, data_self, offset) ==
8959 PyUnicode_READ(kind_sub, data_sub, 0) &&
8960 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8961 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8962 /* If both are of the same kind, memcmp is sufficient */
8963 if (kind_self == kind_sub) {
8964 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008965 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 data_sub,
8967 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008968 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 }
8970 /* otherwise we have to compare each character by first accesing it */
8971 else {
8972 /* We do not need to compare 0 and len(substring)-1 because
8973 the if statement above ensured already that they are equal
8974 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008975 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 for (i = 1; i < end_sub; ++i) {
8977 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8978 PyUnicode_READ(kind_sub, data_sub, i))
8979 return 0;
8980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
8984
8985 return 0;
8986}
8987
Alexander Belopolsky40018472011-02-26 01:02:56 +00008988Py_ssize_t
8989PyUnicode_Tailmatch(PyObject *str,
8990 PyObject *substr,
8991 Py_ssize_t start,
8992 Py_ssize_t end,
8993 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008996
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 str = PyUnicode_FromObject(str);
8998 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000 substr = PyUnicode_FromObject(substr);
9001 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 Py_DECREF(str);
9003 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Tim Petersced69f82003-09-16 20:30:58 +00009005
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009006 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 Py_DECREF(str);
9009 Py_DECREF(substr);
9010 return result;
9011}
9012
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013/* Apply fixfct filter to the Unicode object self and return a
9014 reference to the modified object */
9015
Alexander Belopolsky40018472011-02-26 01:02:56 +00009016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009017fixup(PyObject *self,
9018 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 PyObject *u;
9021 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009022 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009024 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009027 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 /* fix functions return the new maximum character in a string,
9030 if the kind of the resulting unicode object does not change,
9031 everything is fine. Otherwise we need to change the string kind
9032 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009033 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009034
9035 if (maxchar_new == 0) {
9036 /* no changes */;
9037 if (PyUnicode_CheckExact(self)) {
9038 Py_DECREF(u);
9039 Py_INCREF(self);
9040 return self;
9041 }
9042 else
9043 return u;
9044 }
9045
Victor Stinnere6abb482012-05-02 01:15:40 +02009046 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047
Victor Stinnereaab6042011-12-11 22:22:39 +01009048 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009050
9051 /* In case the maximum character changed, we need to
9052 convert the string to the new category. */
9053 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9054 if (v == NULL) {
9055 Py_DECREF(u);
9056 return NULL;
9057 }
9058 if (maxchar_new > maxchar_old) {
9059 /* If the maxchar increased so that the kind changed, not all
9060 characters are representable anymore and we need to fix the
9061 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009062 _PyUnicode_FastCopyCharacters(v, 0,
9063 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009064 maxchar_old = fixfct(v);
9065 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 }
9067 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009068 _PyUnicode_FastCopyCharacters(v, 0,
9069 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009071 Py_DECREF(u);
9072 assert(_PyUnicode_CheckConsistency(v, 1));
9073 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074}
9075
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009076static PyObject *
9077ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009079 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9080 char *resdata, *data = PyUnicode_DATA(self);
9081 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009082
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009083 res = PyUnicode_New(len, 127);
9084 if (res == NULL)
9085 return NULL;
9086 resdata = PyUnicode_DATA(res);
9087 if (lower)
9088 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009090 _Py_bytes_upper(resdata, data, len);
9091 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
9093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009095handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009097 Py_ssize_t j;
9098 int final_sigma;
9099 Py_UCS4 c;
9100 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009101
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009102 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9103
9104 where ! is a negation and \p{xxx} is a character with property xxx.
9105 */
9106 for (j = i - 1; j >= 0; j--) {
9107 c = PyUnicode_READ(kind, data, j);
9108 if (!_PyUnicode_IsCaseIgnorable(c))
9109 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009111 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9112 if (final_sigma) {
9113 for (j = i + 1; j < length; j++) {
9114 c = PyUnicode_READ(kind, data, j);
9115 if (!_PyUnicode_IsCaseIgnorable(c))
9116 break;
9117 }
9118 final_sigma = j == length || !_PyUnicode_IsCased(c);
9119 }
9120 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121}
9122
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009123static int
9124lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9125 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009127 /* Obscure special case. */
9128 if (c == 0x3A3) {
9129 mapped[0] = handle_capital_sigma(kind, data, length, i);
9130 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009132 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133}
9134
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009135static Py_ssize_t
9136do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009138 Py_ssize_t i, k = 0;
9139 int n_res, j;
9140 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009141
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009142 c = PyUnicode_READ(kind, data, 0);
9143 n_res = _PyUnicode_ToUpperFull(c, mapped);
9144 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009145 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009148 for (i = 1; i < length; i++) {
9149 c = PyUnicode_READ(kind, data, i);
9150 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9151 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009152 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009154 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009155 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009156 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157}
9158
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009159static Py_ssize_t
9160do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9161 Py_ssize_t i, k = 0;
9162
9163 for (i = 0; i < length; i++) {
9164 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9165 int n_res, j;
9166 if (Py_UNICODE_ISUPPER(c)) {
9167 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9168 }
9169 else if (Py_UNICODE_ISLOWER(c)) {
9170 n_res = _PyUnicode_ToUpperFull(c, mapped);
9171 }
9172 else {
9173 n_res = 1;
9174 mapped[0] = c;
9175 }
9176 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009177 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178 res[k++] = mapped[j];
9179 }
9180 }
9181 return k;
9182}
9183
9184static Py_ssize_t
9185do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9186 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 Py_ssize_t i, k = 0;
9189
9190 for (i = 0; i < length; i++) {
9191 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9192 int n_res, j;
9193 if (lower)
9194 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9195 else
9196 n_res = _PyUnicode_ToUpperFull(c, mapped);
9197 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009198 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009199 res[k++] = mapped[j];
9200 }
9201 }
9202 return k;
9203}
9204
9205static Py_ssize_t
9206do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9207{
9208 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9209}
9210
9211static Py_ssize_t
9212do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9213{
9214 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9215}
9216
Benjamin Petersone51757f2012-01-12 21:10:29 -05009217static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009218do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9219{
9220 Py_ssize_t i, k = 0;
9221
9222 for (i = 0; i < length; i++) {
9223 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9224 Py_UCS4 mapped[3];
9225 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9226 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009227 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009228 res[k++] = mapped[j];
9229 }
9230 }
9231 return k;
9232}
9233
9234static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009235do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9236{
9237 Py_ssize_t i, k = 0;
9238 int previous_is_cased;
9239
9240 previous_is_cased = 0;
9241 for (i = 0; i < length; i++) {
9242 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9243 Py_UCS4 mapped[3];
9244 int n_res, j;
9245
9246 if (previous_is_cased)
9247 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9248 else
9249 n_res = _PyUnicode_ToTitleFull(c, mapped);
9250
9251 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009252 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009253 res[k++] = mapped[j];
9254 }
9255
9256 previous_is_cased = _PyUnicode_IsCased(c);
9257 }
9258 return k;
9259}
9260
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009261static PyObject *
9262case_operation(PyObject *self,
9263 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9264{
9265 PyObject *res = NULL;
9266 Py_ssize_t length, newlength = 0;
9267 int kind, outkind;
9268 void *data, *outdata;
9269 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9270
Benjamin Petersoneea48462012-01-16 14:28:50 -05009271 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009272
9273 kind = PyUnicode_KIND(self);
9274 data = PyUnicode_DATA(self);
9275 length = PyUnicode_GET_LENGTH(self);
9276 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9277 if (tmp == NULL)
9278 return PyErr_NoMemory();
9279 newlength = perform(kind, data, length, tmp, &maxchar);
9280 res = PyUnicode_New(newlength, maxchar);
9281 if (res == NULL)
9282 goto leave;
9283 tmpend = tmp + newlength;
9284 outdata = PyUnicode_DATA(res);
9285 outkind = PyUnicode_KIND(res);
9286 switch (outkind) {
9287 case PyUnicode_1BYTE_KIND:
9288 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9289 break;
9290 case PyUnicode_2BYTE_KIND:
9291 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9292 break;
9293 case PyUnicode_4BYTE_KIND:
9294 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9295 break;
9296 default:
9297 assert(0);
9298 break;
9299 }
9300 leave:
9301 PyMem_FREE(tmp);
9302 return res;
9303}
9304
Tim Peters8ce9f162004-08-27 01:49:32 +00009305PyObject *
9306PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009309 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009311 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009312 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9313 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009314 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009316 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009318 int use_memcpy;
9319 unsigned char *res_data = NULL, *sep_data = NULL;
9320 PyObject *last_obj;
9321 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322
Tim Peters05eba1f2004-08-27 21:32:02 +00009323 fseq = PySequence_Fast(seq, "");
9324 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009325 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009326 }
9327
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009328 /* NOTE: the following code can't call back into Python code,
9329 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009330 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009331
Tim Peters05eba1f2004-08-27 21:32:02 +00009332 seqlen = PySequence_Fast_GET_SIZE(fseq);
9333 /* If empty sequence, return u"". */
9334 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009335 Py_DECREF(fseq);
9336 Py_INCREF(unicode_empty);
9337 res = unicode_empty;
9338 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009339 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009340
Tim Peters05eba1f2004-08-27 21:32:02 +00009341 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009342 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009343 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009344 if (seqlen == 1) {
9345 if (PyUnicode_CheckExact(items[0])) {
9346 res = items[0];
9347 Py_INCREF(res);
9348 Py_DECREF(fseq);
9349 return res;
9350 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009351 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009352 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009353 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009354 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009355 /* Set up sep and seplen */
9356 if (separator == NULL) {
9357 /* fall back to a blank space separator */
9358 sep = PyUnicode_FromOrdinal(' ');
9359 if (!sep)
9360 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009361 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009362 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009363 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009364 else {
9365 if (!PyUnicode_Check(separator)) {
9366 PyErr_Format(PyExc_TypeError,
9367 "separator: expected str instance,"
9368 " %.80s found",
9369 Py_TYPE(separator)->tp_name);
9370 goto onError;
9371 }
9372 if (PyUnicode_READY(separator))
9373 goto onError;
9374 sep = separator;
9375 seplen = PyUnicode_GET_LENGTH(separator);
9376 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9377 /* inc refcount to keep this code path symmetric with the
9378 above case of a blank separator */
9379 Py_INCREF(sep);
9380 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009382 }
9383
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009384 /* There are at least two things to join, or else we have a subclass
9385 * of str in the sequence.
9386 * Do a pre-pass to figure out the total amount of space we'll
9387 * need (sz), and see whether all argument are strings.
9388 */
9389 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009390#ifdef Py_DEBUG
9391 use_memcpy = 0;
9392#else
9393 use_memcpy = 1;
9394#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009395 for (i = 0; i < seqlen; i++) {
9396 const Py_ssize_t old_sz = sz;
9397 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 if (!PyUnicode_Check(item)) {
9399 PyErr_Format(PyExc_TypeError,
9400 "sequence item %zd: expected str instance,"
9401 " %.80s found",
9402 i, Py_TYPE(item)->tp_name);
9403 goto onError;
9404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 if (PyUnicode_READY(item) == -1)
9406 goto onError;
9407 sz += PyUnicode_GET_LENGTH(item);
9408 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009409 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009410 if (i != 0)
9411 sz += seplen;
9412 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9413 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009415 goto onError;
9416 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009417 if (use_memcpy && last_obj != NULL) {
9418 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9419 use_memcpy = 0;
9420 }
9421 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009422 }
Tim Petersced69f82003-09-16 20:30:58 +00009423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 if (res == NULL)
9426 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009427
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009428 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009429#ifdef Py_DEBUG
9430 use_memcpy = 0;
9431#else
9432 if (use_memcpy) {
9433 res_data = PyUnicode_1BYTE_DATA(res);
9434 kind = PyUnicode_KIND(res);
9435 if (seplen != 0)
9436 sep_data = PyUnicode_1BYTE_DATA(sep);
9437 }
9438#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009440 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009441 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009443 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009444 if (use_memcpy) {
9445 Py_MEMCPY(res_data,
9446 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009447 kind * seplen);
9448 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009449 }
9450 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009451 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009452 res_offset += seplen;
9453 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009455 itemlen = PyUnicode_GET_LENGTH(item);
9456 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009457 if (use_memcpy) {
9458 Py_MEMCPY(res_data,
9459 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009460 kind * itemlen);
9461 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009462 }
9463 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009464 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009465 res_offset += itemlen;
9466 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009467 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009468 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 if (use_memcpy)
9470 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009471 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009472 else
9473 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009474
Tim Peters05eba1f2004-08-27 21:32:02 +00009475 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009477 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009483 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 return NULL;
9485}
9486
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, whose element width is selected by `kind`, starting at
   character index `start`.

   `kind` must be one of the 1-, 2- or 4-byte kinds.  The arguments are
   expanded more than once, so pass side-effect-free expressions only. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9510
Victor Stinnerd3f08822012-05-29 12:57:52 +02009511void
9512_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9513 Py_UCS4 fill_char)
9514{
9515 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9516 const void *data = PyUnicode_DATA(unicode);
9517 assert(PyUnicode_IS_READY(unicode));
9518 assert(unicode_modifiable(unicode));
9519 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9520 assert(start >= 0);
9521 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9522 FILL(kind, data, fill_char, start, length);
9523}
9524
Victor Stinner3fe55312012-01-04 00:33:50 +01009525Py_ssize_t
9526PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9527 Py_UCS4 fill_char)
9528{
9529 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009530
9531 if (!PyUnicode_Check(unicode)) {
9532 PyErr_BadInternalCall();
9533 return -1;
9534 }
9535 if (PyUnicode_READY(unicode) == -1)
9536 return -1;
9537 if (unicode_check_modifiable(unicode))
9538 return -1;
9539
Victor Stinnerd3f08822012-05-29 12:57:52 +02009540 if (start < 0) {
9541 PyErr_SetString(PyExc_IndexError, "string index out of range");
9542 return -1;
9543 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009544 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9545 PyErr_SetString(PyExc_ValueError,
9546 "fill character is bigger than "
9547 "the string maximum character");
9548 return -1;
9549 }
9550
9551 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9552 length = Py_MIN(maxlen, length);
9553 if (length <= 0)
9554 return 0;
9555
Victor Stinnerd3f08822012-05-29 12:57:52 +02009556 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009557 return length;
9558}
9559
Victor Stinner9310abb2011-10-05 00:59:23 +02009560static PyObject *
9561pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009562 Py_ssize_t left,
9563 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyObject *u;
9567 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009568 int kind;
9569 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570
9571 if (left < 0)
9572 left = 0;
9573 if (right < 0)
9574 right = 0;
9575
Victor Stinnerc4b49542011-12-11 22:44:26 +01009576 if (left == 0 && right == 0)
9577 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9580 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009581 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9582 return NULL;
9583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009585 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009587 if (!u)
9588 return NULL;
9589
9590 kind = PyUnicode_KIND(u);
9591 data = PyUnicode_DATA(u);
9592 if (left)
9593 FILL(kind, data, fill, 0, left);
9594 if (right)
9595 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009596 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009597 assert(_PyUnicode_CheckConsistency(u, 1));
9598 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Alexander Belopolsky40018472011-02-26 01:02:56 +00009601PyObject *
9602PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605
9606 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009607 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009609 if (PyUnicode_READY(string) == -1) {
9610 Py_DECREF(string);
9611 return NULL;
9612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613
Benjamin Petersonead6b532011-12-20 17:23:42 -06009614 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009616 if (PyUnicode_IS_ASCII(string))
9617 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009618 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619 PyUnicode_GET_LENGTH(string), keepends);
9620 else
9621 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009622 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009623 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 break;
9625 case PyUnicode_2BYTE_KIND:
9626 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009627 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 PyUnicode_GET_LENGTH(string), keepends);
9629 break;
9630 case PyUnicode_4BYTE_KIND:
9631 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009632 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 PyUnicode_GET_LENGTH(string), keepends);
9634 break;
9635 default:
9636 assert(0);
9637 list = 0;
9638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 Py_DECREF(string);
9640 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Alexander Belopolsky40018472011-02-26 01:02:56 +00009643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009644split(PyObject *self,
9645 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009646 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 int kind1, kind2, kind;
9649 void *buf1, *buf2;
9650 Py_ssize_t len1, len2;
9651 PyObject* out;
9652
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009654 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 if (PyUnicode_READY(self) == -1)
9657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009660 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009662 if (PyUnicode_IS_ASCII(self))
9663 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009664 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009665 PyUnicode_GET_LENGTH(self), maxcount
9666 );
9667 else
9668 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009669 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009670 PyUnicode_GET_LENGTH(self), maxcount
9671 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 case PyUnicode_2BYTE_KIND:
9673 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009674 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyUnicode_GET_LENGTH(self), maxcount
9676 );
9677 case PyUnicode_4BYTE_KIND:
9678 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009679 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 PyUnicode_GET_LENGTH(self), maxcount
9681 );
9682 default:
9683 assert(0);
9684 return NULL;
9685 }
9686
9687 if (PyUnicode_READY(substring) == -1)
9688 return NULL;
9689
9690 kind1 = PyUnicode_KIND(self);
9691 kind2 = PyUnicode_KIND(substring);
9692 kind = kind1 > kind2 ? kind1 : kind2;
9693 buf1 = PyUnicode_DATA(self);
9694 buf2 = PyUnicode_DATA(substring);
9695 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009696 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (!buf1)
9698 return NULL;
9699 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009700 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 if (!buf2) {
9702 if (kind1 != kind) PyMem_Free(buf1);
9703 return NULL;
9704 }
9705 len1 = PyUnicode_GET_LENGTH(self);
9706 len2 = PyUnicode_GET_LENGTH(substring);
9707
Benjamin Petersonead6b532011-12-20 17:23:42 -06009708 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009710 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9711 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009712 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009713 else
9714 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 break;
9717 case PyUnicode_2BYTE_KIND:
9718 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009719 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 break;
9721 case PyUnicode_4BYTE_KIND:
9722 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009723 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 break;
9725 default:
9726 out = NULL;
9727 }
9728 if (kind1 != kind)
9729 PyMem_Free(buf1);
9730 if (kind2 != kind)
9731 PyMem_Free(buf2);
9732 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733}
9734
Alexander Belopolsky40018472011-02-26 01:02:56 +00009735static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009736rsplit(PyObject *self,
9737 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009738 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 int kind1, kind2, kind;
9741 void *buf1, *buf2;
9742 Py_ssize_t len1, len2;
9743 PyObject* out;
9744
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009745 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009746 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 if (PyUnicode_READY(self) == -1)
9749 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009752 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009754 if (PyUnicode_IS_ASCII(self))
9755 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009756 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009757 PyUnicode_GET_LENGTH(self), maxcount
9758 );
9759 else
9760 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009761 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009762 PyUnicode_GET_LENGTH(self), maxcount
9763 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 case PyUnicode_2BYTE_KIND:
9765 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009766 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 PyUnicode_GET_LENGTH(self), maxcount
9768 );
9769 case PyUnicode_4BYTE_KIND:
9770 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 PyUnicode_GET_LENGTH(self), maxcount
9773 );
9774 default:
9775 assert(0);
9776 return NULL;
9777 }
9778
9779 if (PyUnicode_READY(substring) == -1)
9780 return NULL;
9781
9782 kind1 = PyUnicode_KIND(self);
9783 kind2 = PyUnicode_KIND(substring);
9784 kind = kind1 > kind2 ? kind1 : kind2;
9785 buf1 = PyUnicode_DATA(self);
9786 buf2 = PyUnicode_DATA(substring);
9787 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (!buf1)
9790 return NULL;
9791 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009792 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (!buf2) {
9794 if (kind1 != kind) PyMem_Free(buf1);
9795 return NULL;
9796 }
9797 len1 = PyUnicode_GET_LENGTH(self);
9798 len2 = PyUnicode_GET_LENGTH(substring);
9799
Benjamin Petersonead6b532011-12-20 17:23:42 -06009800 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009802 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9803 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009805 else
9806 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 break;
9809 case PyUnicode_2BYTE_KIND:
9810 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 break;
9813 case PyUnicode_4BYTE_KIND:
9814 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 break;
9817 default:
9818 out = NULL;
9819 }
9820 if (kind1 != kind)
9821 PyMem_Free(buf1);
9822 if (kind2 != kind)
9823 PyMem_Free(buf2);
9824 return out;
9825}
9826
9827static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009828anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9829 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009831 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009833 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9834 return asciilib_find(buf1, len1, buf2, len2, offset);
9835 else
9836 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 case PyUnicode_2BYTE_KIND:
9838 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9839 case PyUnicode_4BYTE_KIND:
9840 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9841 }
9842 assert(0);
9843 return -1;
9844}
9845
9846static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9848 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009850 switch (kind) {
9851 case PyUnicode_1BYTE_KIND:
9852 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9853 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9854 else
9855 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9856 case PyUnicode_2BYTE_KIND:
9857 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9858 case PyUnicode_4BYTE_KIND:
9859 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9860 }
9861 assert(0);
9862 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009863}
9864
Alexander Belopolsky40018472011-02-26 01:02:56 +00009865static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866replace(PyObject *self, PyObject *str1,
9867 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyObject *u;
9870 char *sbuf = PyUnicode_DATA(self);
9871 char *buf1 = PyUnicode_DATA(str1);
9872 char *buf2 = PyUnicode_DATA(str2);
9873 int srelease = 0, release1 = 0, release2 = 0;
9874 int skind = PyUnicode_KIND(self);
9875 int kind1 = PyUnicode_KIND(str1);
9876 int kind2 = PyUnicode_KIND(str2);
9877 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9878 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9879 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009880 int mayshrink;
9881 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882
9883 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009884 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009886 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
Victor Stinner59de0ee2011-10-07 10:01:28 +02009888 if (str1 == str2)
9889 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 if (skind < kind1)
9891 /* substring too wide to be present */
9892 goto nothing;
9893
Victor Stinner49a0a212011-10-12 23:46:10 +02009894 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9895 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9896 /* Replacing str1 with str2 may cause a maxchar reduction in the
9897 result string. */
9898 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009899 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009902 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009904 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009907 Py_UCS4 u1, u2;
9908 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009909 Py_ssize_t index, pos;
9910 char *src;
9911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009913 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9914 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009915 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009918 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009920 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009922
9923 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9924 index = 0;
9925 src = sbuf;
9926 while (--maxcount)
9927 {
9928 pos++;
9929 src += pos * PyUnicode_KIND(self);
9930 slen -= pos;
9931 index += pos;
9932 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9933 if (pos < 0)
9934 break;
9935 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9936 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009937 }
9938 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 int rkind = skind;
9940 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009941 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (kind1 < rkind) {
9944 /* widen substring */
9945 buf1 = _PyUnicode_AsKind(str1, rkind);
9946 if (!buf1) goto error;
9947 release1 = 1;
9948 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009950 if (i < 0)
9951 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (rkind > kind2) {
9953 /* widen replacement */
9954 buf2 = _PyUnicode_AsKind(str2, rkind);
9955 if (!buf2) goto error;
9956 release2 = 1;
9957 }
9958 else if (rkind < kind2) {
9959 /* widen self and buf1 */
9960 rkind = kind2;
9961 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009962 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 sbuf = _PyUnicode_AsKind(self, rkind);
9964 if (!sbuf) goto error;
9965 srelease = 1;
9966 buf1 = _PyUnicode_AsKind(str1, rkind);
9967 if (!buf1) goto error;
9968 release1 = 1;
9969 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009970 u = PyUnicode_New(slen, maxchar);
9971 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009973 assert(PyUnicode_KIND(u) == rkind);
9974 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009975
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009977 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009978 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009980 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009982
9983 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009986 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009987 if (i == -1)
9988 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009989 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009991 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009995 }
9996 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009998 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 int rkind = skind;
10000 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010003 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 buf1 = _PyUnicode_AsKind(str1, rkind);
10005 if (!buf1) goto error;
10006 release1 = 1;
10007 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010009 if (n == 0)
10010 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010012 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 buf2 = _PyUnicode_AsKind(str2, rkind);
10014 if (!buf2) goto error;
10015 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010018 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 rkind = kind2;
10020 sbuf = _PyUnicode_AsKind(self, rkind);
10021 if (!sbuf) goto error;
10022 srelease = 1;
10023 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010024 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 buf1 = _PyUnicode_AsKind(str1, rkind);
10026 if (!buf1) goto error;
10027 release1 = 1;
10028 }
10029 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10030 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010031 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 PyErr_SetString(PyExc_OverflowError,
10033 "replace string is too long");
10034 goto error;
10035 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010036 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010037 if (new_size == 0) {
10038 Py_INCREF(unicode_empty);
10039 u = unicode_empty;
10040 goto done;
10041 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010042 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 PyErr_SetString(PyExc_OverflowError,
10044 "replace string is too long");
10045 goto error;
10046 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010047 u = PyUnicode_New(new_size, maxchar);
10048 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 assert(PyUnicode_KIND(u) == rkind);
10051 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 ires = i = 0;
10053 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 while (n-- > 0) {
10055 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010057 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010059 if (j == -1)
10060 break;
10061 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010063 memcpy(res + rkind * ires,
10064 sbuf + rkind * i,
10065 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 }
10068 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010072 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010078 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010079 memcpy(res + rkind * ires,
10080 sbuf + rkind * i,
10081 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010082 }
10083 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084 /* interleave */
10085 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010088 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010090 if (--n <= 0)
10091 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010092 memcpy(res + rkind * ires,
10093 sbuf + rkind * i,
10094 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 ires++;
10096 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010097 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010098 memcpy(res + rkind * ires,
10099 sbuf + rkind * i,
10100 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010101 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010102 }
10103
10104 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010105 unicode_adjust_maxchar(&u);
10106 if (u == NULL)
10107 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010109
10110 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (srelease)
10112 PyMem_FREE(sbuf);
10113 if (release1)
10114 PyMem_FREE(buf1);
10115 if (release2)
10116 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010117 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119
Benjamin Peterson29060642009-01-31 22:14:21 +000010120 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010121 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (srelease)
10123 PyMem_FREE(sbuf);
10124 if (release1)
10125 PyMem_FREE(buf1);
10126 if (release2)
10127 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010128 return unicode_result_unchanged(self);
10129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 error:
10131 if (srelease && sbuf)
10132 PyMem_FREE(sbuf);
10133 if (release1 && buf1)
10134 PyMem_FREE(buf1);
10135 if (release2 && buf2)
10136 PyMem_FREE(buf2);
10137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138}
10139
10140/* --- Unicode Object Methods --------------------------------------------- */
10141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010142PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144\n\
10145Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010146characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
10148static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010149unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010151 if (PyUnicode_READY(self) == -1)
10152 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010153 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154}
10155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010156PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158\n\
10159Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010160have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
10162static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010163unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010165 if (PyUnicode_READY(self) == -1)
10166 return NULL;
10167 if (PyUnicode_GET_LENGTH(self) == 0)
10168 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010169 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170}
10171
Benjamin Petersond5890c82012-01-14 13:23:30 -050010172PyDoc_STRVAR(casefold__doc__,
10173 "S.casefold() -> str\n\
10174\n\
10175Return a version of S suitable for caseless comparisons.");
10176
10177static PyObject *
10178unicode_casefold(PyObject *self)
10179{
10180 if (PyUnicode_READY(self) == -1)
10181 return NULL;
10182 if (PyUnicode_IS_ASCII(self))
10183 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010184 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010185}
10186
10187
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010188/* Argument converter. Coerces to a single unicode character */
10189
10190static int
10191convert_uc(PyObject *obj, void *addr)
10192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010195
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 uniobj = PyUnicode_FromObject(obj);
10197 if (uniobj == NULL) {
10198 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010200 return 0;
10201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010203 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010204 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010205 Py_DECREF(uniobj);
10206 return 0;
10207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 Py_DECREF(uniobj);
10210 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010211}
10212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010213PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010216Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010217done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
10219static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010220unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010222 Py_ssize_t marg, left;
10223 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 Py_UCS4 fillchar = ' ';
10225
Victor Stinnere9a29352011-10-01 02:14:59 +020010226 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Benjamin Petersonbac79492012-01-14 13:34:47 -050010229 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230 return NULL;
10231
Victor Stinnerc4b49542011-12-11 22:44:26 +010010232 if (PyUnicode_GET_LENGTH(self) >= width)
10233 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234
Victor Stinnerc4b49542011-12-11 22:44:26 +010010235 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 left = marg / 2 + (marg & width & 1);
10237
Victor Stinner9310abb2011-10-05 00:59:23 +020010238 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239}
10240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241/* This function assumes that str1 and str2 are readied by the caller. */
10242
Marc-André Lemburge5034372000-08-08 08:04:29 +000010243static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010244unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 int kind1, kind2;
10247 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010248 Py_ssize_t len1, len2;
10249 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010250
Victor Stinner90db9c42012-10-04 21:53:50 +020010251 /* a string is equal to itself */
10252 if (str1 == str2)
10253 return 0;
10254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 kind1 = PyUnicode_KIND(str1);
10256 kind2 = PyUnicode_KIND(str2);
10257 data1 = PyUnicode_DATA(str1);
10258 data2 = PyUnicode_DATA(str2);
10259 len1 = PyUnicode_GET_LENGTH(str1);
10260 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010261 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010262
Victor Stinner770e19e2012-10-04 22:59:45 +020010263 if (kind1 == 1 && kind2 == 1) {
10264 int cmp = memcmp(data1, data2, len);
10265 /* normalize result of memcmp() into the range [-1; 1] */
10266 if (cmp < 0)
10267 return -1;
10268 if (cmp > 0)
10269 return 1;
10270 }
10271 else {
10272 for (i = 0; i < len; ++i) {
10273 Py_UCS4 c1, c2;
10274 c1 = PyUnicode_READ(kind1, data1, i);
10275 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010276
Victor Stinner770e19e2012-10-04 22:59:45 +020010277 if (c1 != c2)
10278 return (c1 < c2) ? -1 : 1;
10279 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010280 }
10281
Victor Stinner770e19e2012-10-04 22:59:45 +020010282 if (len1 == len2)
10283 return 0;
10284 if (len1 < len2)
10285 return -1;
10286 else
10287 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010288}
10289
Victor Stinnere5567ad2012-10-23 02:48:49 +020010290static int
10291unicode_compare_eq(PyObject *str1, PyObject *str2)
10292{
10293 int kind;
10294 void *data1, *data2;
10295 Py_ssize_t len;
10296 int cmp;
10297
10298 /* a string is equal to itself */
10299 if (str1 == str2)
10300 return 1;
10301
10302 len = PyUnicode_GET_LENGTH(str1);
10303 if (PyUnicode_GET_LENGTH(str2) != len)
10304 return 0;
10305 kind = PyUnicode_KIND(str1);
10306 if (PyUnicode_KIND(str2) != kind)
10307 return 0;
10308 data1 = PyUnicode_DATA(str1);
10309 data2 = PyUnicode_DATA(str2);
10310
10311 cmp = memcmp(data1, data2, len * kind);
10312 return (cmp == 0);
10313}
10314
10315
Alexander Belopolsky40018472011-02-26 01:02:56 +000010316int
10317PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10320 if (PyUnicode_READY(left) == -1 ||
10321 PyUnicode_READY(right) == -1)
10322 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010323 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010325 PyErr_Format(PyExc_TypeError,
10326 "Can't compare %.100s and %.100s",
10327 left->ob_type->tp_name,
10328 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329 return -1;
10330}
10331
Martin v. Löwis5b222132007-06-10 09:51:05 +000010332int
10333PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 Py_ssize_t i;
10336 int kind;
10337 void *data;
10338 Py_UCS4 chr;
10339
Victor Stinner910337b2011-10-03 03:20:16 +020010340 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (PyUnicode_READY(uni) == -1)
10342 return -1;
10343 kind = PyUnicode_KIND(uni);
10344 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010345 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10347 if (chr != str[i])
10348 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010349 /* This check keeps Python strings that end in '\0' from comparing equal
10350 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010353 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010355 return 0;
10356}
10357
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010358
Benjamin Peterson29060642009-01-31 22:14:21 +000010359#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010361
Alexander Belopolsky40018472011-02-26 01:02:56 +000010362PyObject *
10363PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010364{
10365 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010366 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367
Victor Stinnere5567ad2012-10-23 02:48:49 +020010368 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10369 Py_RETURN_NOTIMPLEMENTED;
10370
10371 if (PyUnicode_READY(left) == -1 ||
10372 PyUnicode_READY(right) == -1)
10373 return NULL;
10374
10375 if (op == Py_EQ || op == Py_NE) {
10376 result = unicode_compare_eq(left, right);
10377 if (op == Py_EQ)
10378 v = TEST_COND(result);
10379 else
10380 v = TEST_COND(!result);
10381 }
10382 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010383 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010384
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010385 /* Convert the return value to a Boolean */
10386 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010387 case Py_LE:
10388 v = TEST_COND(result <= 0);
10389 break;
10390 case Py_GE:
10391 v = TEST_COND(result >= 0);
10392 break;
10393 case Py_LT:
10394 v = TEST_COND(result == -1);
10395 break;
10396 case Py_GT:
10397 v = TEST_COND(result == 1);
10398 break;
10399 default:
10400 PyErr_BadArgument();
10401 return NULL;
10402 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010403 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010404 Py_INCREF(v);
10405 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010406}
10407
Alexander Belopolsky40018472011-02-26 01:02:56 +000010408int
10409PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010410{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010411 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 int kind1, kind2, kind;
10413 void *buf1, *buf2;
10414 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010415 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010416
10417 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 sub = PyUnicode_FromObject(element);
10419 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 PyErr_Format(PyExc_TypeError,
10421 "'in <string>' requires string as left operand, not %s",
10422 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010423 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010424 }
10425
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010427 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010428 Py_DECREF(sub);
10429 return -1;
10430 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010431 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10432 Py_DECREF(sub);
10433 Py_DECREF(str);
10434 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 kind1 = PyUnicode_KIND(str);
10437 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010438 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 buf1 = PyUnicode_DATA(str);
10440 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010441 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010442 if (kind2 > kind) {
10443 Py_DECREF(sub);
10444 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010445 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010446 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010447 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (!buf2) {
10450 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010451 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 return -1;
10453 }
10454 len1 = PyUnicode_GET_LENGTH(str);
10455 len2 = PyUnicode_GET_LENGTH(sub);
10456
Benjamin Petersonead6b532011-12-20 17:23:42 -060010457 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 case PyUnicode_1BYTE_KIND:
10459 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10460 break;
10461 case PyUnicode_2BYTE_KIND:
10462 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10463 break;
10464 case PyUnicode_4BYTE_KIND:
10465 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10466 break;
10467 default:
10468 result = -1;
10469 assert(0);
10470 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471
10472 Py_DECREF(str);
10473 Py_DECREF(sub);
10474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (kind2 != kind)
10476 PyMem_Free(buf2);
10477
Guido van Rossum403d68b2000-03-13 15:55:09 +000010478 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010479}
10480
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481/* Concat to string or Unicode object giving a new Unicode object. */
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483PyObject *
10484PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010487 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010488 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
10490 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
10498 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010499 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010500 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010503 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506 }
10507
Victor Stinner488fa492011-12-12 00:01:39 +010010508 u_len = PyUnicode_GET_LENGTH(u);
10509 v_len = PyUnicode_GET_LENGTH(v);
10510 if (u_len > PY_SSIZE_T_MAX - v_len) {
10511 PyErr_SetString(PyExc_OverflowError,
10512 "strings are too large to concat");
10513 goto onError;
10514 }
10515 new_len = u_len + v_len;
10516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010518 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010519 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010522 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010525 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10526 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 Py_DECREF(u);
10528 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010529 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 Py_XDECREF(u);
10534 Py_XDECREF(v);
10535 return NULL;
10536}
10537
Walter Dörwald1ab83302007-05-18 17:15:44 +000010538void
Victor Stinner23e56682011-10-03 03:54:37 +020010539PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010540{
Victor Stinner23e56682011-10-03 03:54:37 +020010541 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010542 Py_UCS4 maxchar, maxchar2;
10543 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010544
10545 if (p_left == NULL) {
10546 if (!PyErr_Occurred())
10547 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010548 return;
10549 }
Victor Stinner23e56682011-10-03 03:54:37 +020010550 left = *p_left;
10551 if (right == NULL || !PyUnicode_Check(left)) {
10552 if (!PyErr_Occurred())
10553 PyErr_BadInternalCall();
10554 goto error;
10555 }
10556
Benjamin Petersonbac79492012-01-14 13:34:47 -050010557 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010558 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010559 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010560 goto error;
10561
Victor Stinner488fa492011-12-12 00:01:39 +010010562 /* Shortcuts */
10563 if (left == unicode_empty) {
10564 Py_DECREF(left);
10565 Py_INCREF(right);
10566 *p_left = right;
10567 return;
10568 }
10569 if (right == unicode_empty)
10570 return;
10571
10572 left_len = PyUnicode_GET_LENGTH(left);
10573 right_len = PyUnicode_GET_LENGTH(right);
10574 if (left_len > PY_SSIZE_T_MAX - right_len) {
10575 PyErr_SetString(PyExc_OverflowError,
10576 "strings are too large to concat");
10577 goto error;
10578 }
10579 new_len = left_len + right_len;
10580
10581 if (unicode_modifiable(left)
10582 && PyUnicode_CheckExact(right)
10583 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010584 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10585 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010586 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010587 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010588 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10589 {
10590 /* append inplace */
10591 if (unicode_resize(p_left, new_len) != 0) {
10592 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10593 * deallocated so it cannot be put back into
10594 * 'variable'. The MemoryError is raised when there
10595 * is no value in 'variable', which might (very
10596 * remotely) be a cause of incompatibilities.
10597 */
10598 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010599 }
Victor Stinner488fa492011-12-12 00:01:39 +010010600 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010601 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010602 }
Victor Stinner488fa492011-12-12 00:01:39 +010010603 else {
10604 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10605 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010606 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010607
Victor Stinner488fa492011-12-12 00:01:39 +010010608 /* Concat the two Unicode strings */
10609 res = PyUnicode_New(new_len, maxchar);
10610 if (res == NULL)
10611 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010612 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10613 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010614 Py_DECREF(left);
10615 *p_left = res;
10616 }
10617 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010618 return;
10619
10620error:
Victor Stinner488fa492011-12-12 00:01:39 +010010621 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010622}
10623
10624void
10625PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10626{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010627 PyUnicode_Append(pleft, right);
10628 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010629}
10630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010631PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010632 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010635string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010636interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637
10638static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010639unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010641 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010642 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010643 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 int kind1, kind2, kind;
10646 void *buf1, *buf2;
10647 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648
Jesus Ceaac451502011-04-20 17:09:23 +020010649 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10650 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 kind1 = PyUnicode_KIND(self);
10654 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010655 if (kind2 > kind1)
10656 return PyLong_FromLong(0);
10657 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 buf1 = PyUnicode_DATA(self);
10659 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010661 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (!buf2) {
10663 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 return NULL;
10665 }
10666 len1 = PyUnicode_GET_LENGTH(self);
10667 len2 = PyUnicode_GET_LENGTH(substring);
10668
10669 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010670 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 case PyUnicode_1BYTE_KIND:
10672 iresult = ucs1lib_count(
10673 ((Py_UCS1*)buf1) + start, end - start,
10674 buf2, len2, PY_SSIZE_T_MAX
10675 );
10676 break;
10677 case PyUnicode_2BYTE_KIND:
10678 iresult = ucs2lib_count(
10679 ((Py_UCS2*)buf1) + start, end - start,
10680 buf2, len2, PY_SSIZE_T_MAX
10681 );
10682 break;
10683 case PyUnicode_4BYTE_KIND:
10684 iresult = ucs4lib_count(
10685 ((Py_UCS4*)buf1) + start, end - start,
10686 buf2, len2, PY_SSIZE_T_MAX
10687 );
10688 break;
10689 default:
10690 assert(0); iresult = 0;
10691 }
10692
10693 result = PyLong_FromSsize_t(iresult);
10694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (kind2 != kind)
10696 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697
10698 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 return result;
10701}
10702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010703PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010704 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010706Encode S using the codec registered for encoding. Default encoding\n\
10707is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010708handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010709a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10710'xmlcharrefreplace' as well as any other name registered with\n\
10711codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010714unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010716 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 char *encoding = NULL;
10718 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010719
Benjamin Peterson308d6372009-09-18 21:42:35 +000010720 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10721 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010723 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010724}
10725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010726PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728\n\
10729Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010730If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
10732static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010733unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010735 Py_ssize_t i, j, line_pos, src_len, incr;
10736 Py_UCS4 ch;
10737 PyObject *u;
10738 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010740 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010741 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742
10743 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745
Antoine Pitrou22425222011-10-04 19:10:51 +020010746 if (PyUnicode_READY(self) == -1)
10747 return NULL;
10748
Thomas Wouters7e474022000-07-16 12:04:32 +000010749 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010750 src_len = PyUnicode_GET_LENGTH(self);
10751 i = j = line_pos = 0;
10752 kind = PyUnicode_KIND(self);
10753 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010754 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010755 for (; i < src_len; i++) {
10756 ch = PyUnicode_READ(kind, src_data, i);
10757 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010758 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010760 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010762 goto overflow;
10763 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010765 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010769 goto overflow;
10770 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010772 if (ch == '\n' || ch == '\r')
10773 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010775 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010776 if (!found)
10777 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010778
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010780 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 if (!u)
10782 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010783 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Antoine Pitroue71d5742011-10-04 15:55:09 +020010785 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Antoine Pitroue71d5742011-10-04 15:55:09 +020010787 for (; i < src_len; i++) {
10788 ch = PyUnicode_READ(kind, src_data, i);
10789 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010791 incr = tabsize - (line_pos % tabsize);
10792 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010793 FILL(kind, dest_data, ' ', j, incr);
10794 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010795 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010796 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010798 line_pos++;
10799 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010800 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010801 if (ch == '\n' || ch == '\r')
10802 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010804 }
10805 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010806 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010807
Antoine Pitroue71d5742011-10-04 15:55:09 +020010808 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010809 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811}
10812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010813PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815\n\
10816Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010817such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818arguments start and end are interpreted as in slice notation.\n\
10819\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010820Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
10822static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010825 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010826 Py_ssize_t start;
10827 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010828 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
Jesus Ceaac451502011-04-20 17:09:23 +020010830 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10831 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 if (PyUnicode_READY(self) == -1)
10835 return NULL;
10836 if (PyUnicode_READY(substring) == -1)
10837 return NULL;
10838
Victor Stinner7931d9a2011-11-04 00:22:48 +010010839 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
10841 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (result == -2)
10844 return NULL;
10845
Christian Heimes217cfd12007-12-02 14:31:20 +000010846 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847}
10848
10849static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010850unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010852 void *data;
10853 enum PyUnicode_Kind kind;
10854 Py_UCS4 ch;
10855 PyObject *res;
10856
10857 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10858 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010860 }
10861 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10862 PyErr_SetString(PyExc_IndexError, "string index out of range");
10863 return NULL;
10864 }
10865 kind = PyUnicode_KIND(self);
10866 data = PyUnicode_DATA(self);
10867 ch = PyUnicode_READ(kind, data, index);
10868 if (ch < 256)
10869 return get_latin1_char(ch);
10870
10871 res = PyUnicode_New(1, ch);
10872 if (res == NULL)
10873 return NULL;
10874 kind = PyUnicode_KIND(res);
10875 data = PyUnicode_DATA(res);
10876 PyUnicode_WRITE(kind, data, 0, ch);
10877 assert(_PyUnicode_CheckConsistency(res, 1));
10878 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879}
10880
Guido van Rossumc2504932007-09-18 19:42:40 +000010881/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010882 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010883static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010884unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Guido van Rossumc2504932007-09-18 19:42:40 +000010886 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010887 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010888
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010889#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010890 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010891#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (_PyUnicode_HASH(self) != -1)
10893 return _PyUnicode_HASH(self);
10894 if (PyUnicode_READY(self) == -1)
10895 return -1;
10896 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010897 /*
10898 We make the hash of the empty string be 0, rather than using
10899 (prefix ^ suffix), since this slightly obfuscates the hash secret
10900 */
10901 if (len == 0) {
10902 _PyUnicode_HASH(self) = 0;
10903 return 0;
10904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905
10906 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010907#define HASH(P) \
10908 x ^= (Py_uhash_t) *P << 7; \
10909 while (--len >= 0) \
10910 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911
Georg Brandl2fb477c2012-02-21 00:33:36 +010010912 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 switch (PyUnicode_KIND(self)) {
10914 case PyUnicode_1BYTE_KIND: {
10915 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10916 HASH(c);
10917 break;
10918 }
10919 case PyUnicode_2BYTE_KIND: {
10920 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10921 HASH(s);
10922 break;
10923 }
10924 default: {
10925 Py_UCS4 *l;
10926 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10927 "Impossible switch case in unicode_hash");
10928 l = PyUnicode_4BYTE_DATA(self);
10929 HASH(l);
10930 break;
10931 }
10932 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010933 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10934 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935
Guido van Rossumc2504932007-09-18 19:42:40 +000010936 if (x == -1)
10937 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010939 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010943PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010951 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010952 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010953 Py_ssize_t start;
10954 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Jesus Ceaac451502011-04-20 17:09:23 +020010956 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10957 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (PyUnicode_READY(self) == -1)
10961 return NULL;
10962 if (PyUnicode_READY(substring) == -1)
10963 return NULL;
10964
Victor Stinner7931d9a2011-11-04 00:22:48 +010010965 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (result == -2)
10970 return NULL;
10971
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 if (result < 0) {
10973 PyErr_SetString(PyExc_ValueError, "substring not found");
10974 return NULL;
10975 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010976
Christian Heimes217cfd12007-12-02 14:31:20 +000010977 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978}
10979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010980PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010983Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010984at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985
10986static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010987unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 Py_ssize_t i, length;
10990 int kind;
10991 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 int cased;
10993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 if (PyUnicode_READY(self) == -1)
10995 return NULL;
10996 length = PyUnicode_GET_LENGTH(self);
10997 kind = PyUnicode_KIND(self);
10998 data = PyUnicode_DATA(self);
10999
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (length == 1)
11002 return PyBool_FromLong(
11003 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011005 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011008
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 for (i = 0; i < length; i++) {
11011 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011012
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11014 return PyBool_FromLong(0);
11015 else if (!cased && Py_UNICODE_ISLOWER(ch))
11016 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011018 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019}
11020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011021PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011024Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011025at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
11027static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011028unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 Py_ssize_t i, length;
11031 int kind;
11032 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 int cased;
11034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 if (PyUnicode_READY(self) == -1)
11036 return NULL;
11037 length = PyUnicode_GET_LENGTH(self);
11038 kind = PyUnicode_KIND(self);
11039 data = PyUnicode_DATA(self);
11040
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (length == 1)
11043 return PyBool_FromLong(
11044 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011046 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011049
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 for (i = 0; i < length; i++) {
11052 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011053
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11055 return PyBool_FromLong(0);
11056 else if (!cased && Py_UNICODE_ISUPPER(ch))
11057 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011059 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060}
11061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011065Return True if S is a titlecased string and there is at least one\n\
11066character in S, i.e. upper- and titlecase characters may only\n\
11067follow uncased characters and lowercase characters only cased ones.\n\
11068Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011071unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 Py_ssize_t i, length;
11074 int kind;
11075 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076 int cased, previous_is_cased;
11077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 if (PyUnicode_READY(self) == -1)
11079 return NULL;
11080 length = PyUnicode_GET_LENGTH(self);
11081 kind = PyUnicode_KIND(self);
11082 data = PyUnicode_DATA(self);
11083
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (length == 1) {
11086 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11087 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11088 (Py_UNICODE_ISUPPER(ch) != 0));
11089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011091 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011094
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 cased = 0;
11096 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 for (i = 0; i < length; i++) {
11098 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011099
Benjamin Peterson29060642009-01-31 22:14:21 +000011100 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11101 if (previous_is_cased)
11102 return PyBool_FromLong(0);
11103 previous_is_cased = 1;
11104 cased = 1;
11105 }
11106 else if (Py_UNICODE_ISLOWER(ch)) {
11107 if (!previous_is_cased)
11108 return PyBool_FromLong(0);
11109 previous_is_cased = 1;
11110 cased = 1;
11111 }
11112 else
11113 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011115 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116}
11117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011118PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011121Return True if all characters in S are whitespace\n\
11122and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123
11124static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011125unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 Py_ssize_t i, length;
11128 int kind;
11129 void *data;
11130
11131 if (PyUnicode_READY(self) == -1)
11132 return NULL;
11133 length = PyUnicode_GET_LENGTH(self);
11134 kind = PyUnicode_KIND(self);
11135 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 1)
11139 return PyBool_FromLong(
11140 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011142 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 for (i = 0; i < length; i++) {
11147 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011148 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011151 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152}
11153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011154PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011155 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011156\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011157Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011159
11160static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011161unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 Py_ssize_t i, length;
11164 int kind;
11165 void *data;
11166
11167 if (PyUnicode_READY(self) == -1)
11168 return NULL;
11169 length = PyUnicode_GET_LENGTH(self);
11170 kind = PyUnicode_KIND(self);
11171 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011172
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011173 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 if (length == 1)
11175 return PyBool_FromLong(
11176 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011177
11178 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 for (i = 0; i < length; i++) {
11183 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011185 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011186 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011187}
11188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011189PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011190 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011191\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011192Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011193and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011194
11195static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011196unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 int kind;
11199 void *data;
11200 Py_ssize_t len, i;
11201
11202 if (PyUnicode_READY(self) == -1)
11203 return NULL;
11204
11205 kind = PyUnicode_KIND(self);
11206 data = PyUnicode_DATA(self);
11207 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011208
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 if (len == 1) {
11211 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11212 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11213 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011214
11215 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 for (i = 0; i < len; i++) {
11220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011221 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011224 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011230Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 Py_ssize_t i, length;
11237 int kind;
11238 void *data;
11239
11240 if (PyUnicode_READY(self) == -1)
11241 return NULL;
11242 length = PyUnicode_GET_LENGTH(self);
11243 kind = PyUnicode_KIND(self);
11244 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 1)
11248 return PyBool_FromLong(
11249 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011251 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 for (i = 0; i < length; i++) {
11256 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011259 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260}
11261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011262PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011265Return True if all characters in S are digits\n\
11266and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
11268static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011269unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 Py_ssize_t i, length;
11272 int kind;
11273 void *data;
11274
11275 if (PyUnicode_READY(self) == -1)
11276 return NULL;
11277 length = PyUnicode_GET_LENGTH(self);
11278 kind = PyUnicode_KIND(self);
11279 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 if (length == 1) {
11283 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11284 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011287 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 for (i = 0; i < length; i++) {
11292 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011295 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296}
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011301Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011302False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
11304static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011305unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 Py_ssize_t i, length;
11308 int kind;
11309 void *data;
11310
11311 if (PyUnicode_READY(self) == -1)
11312 return NULL;
11313 length = PyUnicode_GET_LENGTH(self);
11314 kind = PyUnicode_KIND(self);
11315 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 1)
11319 return PyBool_FromLong(
11320 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011322 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 for (i = 0; i < length; i++) {
11327 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011330 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331}
11332
Martin v. Löwis47383402007-08-15 07:32:56 +000011333int
11334PyUnicode_IsIdentifier(PyObject *self)
11335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 int kind;
11337 void *data;
11338 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011339 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (PyUnicode_READY(self) == -1) {
11342 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 }
11345
11346 /* Special case for empty strings */
11347 if (PyUnicode_GET_LENGTH(self) == 0)
11348 return 0;
11349 kind = PyUnicode_KIND(self);
11350 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011351
11352 /* PEP 3131 says that the first character must be in
11353 XID_Start and subsequent characters in XID_Continue,
11354 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011355 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011356 letters, digits, underscore). However, given the current
11357 definition of XID_Start and XID_Continue, it is sufficient
11358 to check just for these, except that _ must be allowed
11359 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011361 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011362 return 0;
11363
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011364 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011367 return 1;
11368}
11369
11370PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011372\n\
11373Return True if S is a valid identifier according\n\
11374to the language definition.");
11375
11376static PyObject*
11377unicode_isidentifier(PyObject *self)
11378{
11379 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11380}
11381
Georg Brandl559e5d72008-06-11 18:37:52 +000011382PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011384\n\
11385Return True if all characters in S are considered\n\
11386printable in repr() or S is empty, False otherwise.");
11387
11388static PyObject*
11389unicode_isprintable(PyObject *self)
11390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 Py_ssize_t i, length;
11392 int kind;
11393 void *data;
11394
11395 if (PyUnicode_READY(self) == -1)
11396 return NULL;
11397 length = PyUnicode_GET_LENGTH(self);
11398 kind = PyUnicode_KIND(self);
11399 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011400
11401 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 if (length == 1)
11403 return PyBool_FromLong(
11404 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 for (i = 0; i < length; i++) {
11407 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011408 Py_RETURN_FALSE;
11409 }
11410 }
11411 Py_RETURN_TRUE;
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011415 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
11417Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011418iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011421unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011423 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424}
11425
Martin v. Löwis18e16552006-02-15 17:27:45 +000011426static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 if (PyUnicode_READY(self) == -1)
11430 return -1;
11431 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432}
11433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011437Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011438done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011441unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011443 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 Py_UCS4 fillchar = ' ';
11445
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011446 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 return NULL;
11448
Benjamin Petersonbac79492012-01-14 13:34:47 -050011449 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Victor Stinnerc4b49542011-12-11 22:44:26 +010011452 if (PyUnicode_GET_LENGTH(self) >= width)
11453 return unicode_result_unchanged(self);
11454
11455 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011464unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011466 if (PyUnicode_READY(self) == -1)
11467 return NULL;
11468 if (PyUnicode_IS_ASCII(self))
11469 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011470 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471}
11472
/* Strip modes understood by the strip helpers below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
11481
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011482/* externally visible for str.strip(unicode) */
11483PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011484_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 void *data;
11487 int kind;
11488 Py_ssize_t i, j, len;
11489 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11492 return NULL;
11493
11494 kind = PyUnicode_KIND(self);
11495 data = PyUnicode_DATA(self);
11496 len = PyUnicode_GET_LENGTH(self);
11497 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11498 PyUnicode_DATA(sepobj),
11499 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011500
Benjamin Peterson14339b62009-01-31 16:36:08 +000011501 i = 0;
11502 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 while (i < len &&
11504 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 i++;
11506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011507 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011508
Benjamin Peterson14339b62009-01-31 16:36:08 +000011509 j = len;
11510 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 do {
11512 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 } while (j >= i &&
11514 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011517
Victor Stinner7931d9a2011-11-04 00:22:48 +010011518 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519}
11520
11521PyObject*
11522PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11523{
11524 unsigned char *data;
11525 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011526 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527
Victor Stinnerde636f32011-10-01 03:55:54 +020011528 if (PyUnicode_READY(self) == -1)
11529 return NULL;
11530
Victor Stinner684d5fd2012-05-03 02:32:34 +020011531 length = PyUnicode_GET_LENGTH(self);
11532 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011533
Victor Stinner684d5fd2012-05-03 02:32:34 +020011534 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011535 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536
Victor Stinnerde636f32011-10-01 03:55:54 +020011537 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011538 PyErr_SetString(PyExc_IndexError, "string index out of range");
11539 return NULL;
11540 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011541 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011542 Py_INCREF(unicode_empty);
11543 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011544 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011545
Victor Stinner684d5fd2012-05-03 02:32:34 +020011546 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011547 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011548 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011549 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011550 }
11551 else {
11552 kind = PyUnicode_KIND(self);
11553 data = PyUnicode_1BYTE_DATA(self);
11554 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011555 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011556 length);
11557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
11560static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011561do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 int kind;
11564 void *data;
11565 Py_ssize_t len, i, j;
11566
11567 if (PyUnicode_READY(self) == -1)
11568 return NULL;
11569
11570 kind = PyUnicode_KIND(self);
11571 data = PyUnicode_DATA(self);
11572 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011573
Benjamin Peterson14339b62009-01-31 16:36:08 +000011574 i = 0;
11575 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011577 i++;
11578 }
11579 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011580
Benjamin Peterson14339b62009-01-31 16:36:08 +000011581 j = len;
11582 if (striptype != LEFTSTRIP) {
11583 do {
11584 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011586 j++;
11587 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011588
Victor Stinner7931d9a2011-11-04 00:22:48 +010011589 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590}
11591
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011592
11593static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011597
Benjamin Peterson14339b62009-01-31 16:36:08 +000011598 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11599 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011600
Benjamin Peterson14339b62009-01-31 16:36:08 +000011601 if (sep != NULL && sep != Py_None) {
11602 if (PyUnicode_Check(sep))
11603 return _PyUnicode_XStrip(self, striptype, sep);
11604 else {
11605 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "%s arg must be None or str",
11607 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011608 return NULL;
11609 }
11610 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011611
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011613}
11614
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011618\n\
11619Return a copy of the string S with leading and trailing\n\
11620whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011621If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011622
11623static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011624unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011625{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011626 if (PyTuple_GET_SIZE(args) == 0)
11627 return do_strip(self, BOTHSTRIP); /* Common case */
11628 else
11629 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630}
11631
11632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011635\n\
11636Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011637If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638
11639static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011640unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011642 if (PyTuple_GET_SIZE(args) == 0)
11643 return do_strip(self, LEFTSTRIP); /* Common case */
11644 else
11645 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646}
11647
11648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011649PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011651\n\
11652Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011653If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011654
11655static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011656unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011658 if (PyTuple_GET_SIZE(args) == 0)
11659 return do_strip(self, RIGHTSTRIP); /* Common case */
11660 else
11661 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011662}
11663
11664
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011668 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
Georg Brandl222de0f2009-04-12 12:01:50 +000011671 if (len < 1) {
11672 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011673 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
Victor Stinnerc4b49542011-12-11 22:44:26 +010011676 /* no repeat, return original string */
11677 if (len == 1)
11678 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011679
Benjamin Petersonbac79492012-01-14 13:34:47 -050011680 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 return NULL;
11682
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011683 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011684 PyErr_SetString(PyExc_OverflowError,
11685 "repeated string is too long");
11686 return NULL;
11687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011689
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011690 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 if (!u)
11692 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011693 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (PyUnicode_GET_LENGTH(str) == 1) {
11696 const int kind = PyUnicode_KIND(str);
11697 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011698 if (kind == PyUnicode_1BYTE_KIND) {
11699 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011700 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011701 }
11702 else if (kind == PyUnicode_2BYTE_KIND) {
11703 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011704 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011705 ucs2[n] = fill_char;
11706 } else {
11707 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11708 assert(kind == PyUnicode_4BYTE_KIND);
11709 for (n = 0; n < len; ++n)
11710 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 }
11713 else {
11714 /* number of characters copied this far */
11715 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011716 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 char *to = (char *) PyUnicode_DATA(u);
11718 Py_MEMCPY(to, PyUnicode_DATA(str),
11719 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 n = (done <= nchars-done) ? done : nchars-done;
11722 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011723 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725 }
11726
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011727 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729}
11730
Alexander Belopolsky40018472011-02-26 01:02:56 +000011731PyObject *
11732PyUnicode_Replace(PyObject *obj,
11733 PyObject *subobj,
11734 PyObject *replobj,
11735 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736{
11737 PyObject *self;
11738 PyObject *str1;
11739 PyObject *str2;
11740 PyObject *result;
11741
11742 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011743 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011746 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 Py_DECREF(self);
11748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 }
11750 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011751 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 Py_DECREF(self);
11753 Py_DECREF(str1);
11754 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011756 if (PyUnicode_READY(self) == -1 ||
11757 PyUnicode_READY(str1) == -1 ||
11758 PyUnicode_READY(str2) == -1)
11759 result = NULL;
11760 else
11761 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 Py_DECREF(self);
11763 Py_DECREF(str1);
11764 Py_DECREF(str2);
11765 return result;
11766}
11767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011768PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011769 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770\n\
11771Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011772old replaced by new. If the optional argument count is\n\
11773given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
11775static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 PyObject *str1;
11779 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011780 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 PyObject *result;
11782
Martin v. Löwis18e16552006-02-15 17:27:45 +000011783 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011785 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011788 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 return NULL;
11790 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011791 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 Py_DECREF(str1);
11793 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011794 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011795 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11796 result = NULL;
11797 else
11798 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
11800 Py_DECREF(str1);
11801 Py_DECREF(str2);
11802 return result;
11803}
11804
Alexander Belopolsky40018472011-02-26 01:02:56 +000011805static PyObject *
11806unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011808 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 Py_ssize_t isize;
11810 Py_ssize_t osize, squote, dquote, i, o;
11811 Py_UCS4 max, quote;
11812 int ikind, okind;
11813 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011816 return NULL;
11817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 isize = PyUnicode_GET_LENGTH(unicode);
11819 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 /* Compute length of output, quote characters, and
11822 maximum character */
11823 osize = 2; /* quotes */
11824 max = 127;
11825 squote = dquote = 0;
11826 ikind = PyUnicode_KIND(unicode);
11827 for (i = 0; i < isize; i++) {
11828 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11829 switch (ch) {
11830 case '\'': squote++; osize++; break;
11831 case '"': dquote++; osize++; break;
11832 case '\\': case '\t': case '\r': case '\n':
11833 osize += 2; break;
11834 default:
11835 /* Fast-path ASCII */
11836 if (ch < ' ' || ch == 0x7f)
11837 osize += 4; /* \xHH */
11838 else if (ch < 0x7f)
11839 osize++;
11840 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11841 osize++;
11842 max = ch > max ? ch : max;
11843 }
11844 else if (ch < 0x100)
11845 osize += 4; /* \xHH */
11846 else if (ch < 0x10000)
11847 osize += 6; /* \uHHHH */
11848 else
11849 osize += 10; /* \uHHHHHHHH */
11850 }
11851 }
11852
11853 quote = '\'';
11854 if (squote) {
11855 if (dquote)
11856 /* Both squote and dquote present. Use squote,
11857 and escape them */
11858 osize += squote;
11859 else
11860 quote = '"';
11861 }
11862
11863 repr = PyUnicode_New(osize, max);
11864 if (repr == NULL)
11865 return NULL;
11866 okind = PyUnicode_KIND(repr);
11867 odata = PyUnicode_DATA(repr);
11868
11869 PyUnicode_WRITE(okind, odata, 0, quote);
11870 PyUnicode_WRITE(okind, odata, osize-1, quote);
11871
11872 for (i = 0, o = 1; i < isize; i++) {
11873 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011874
11875 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if ((ch == quote) || (ch == '\\')) {
11877 PyUnicode_WRITE(okind, odata, o++, '\\');
11878 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011879 continue;
11880 }
11881
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011883 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 PyUnicode_WRITE(okind, odata, o++, '\\');
11885 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011886 }
11887 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 PyUnicode_WRITE(okind, odata, o++, '\\');
11889 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011890 }
11891 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 PyUnicode_WRITE(okind, odata, o++, '\\');
11893 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011894 }
11895
11896 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011897 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 PyUnicode_WRITE(okind, odata, o++, '\\');
11899 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011902 }
11903
Georg Brandl559e5d72008-06-11 18:37:52 +000011904 /* Copy ASCII characters as-is */
11905 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011907 }
11908
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011910 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011911 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011912 (categories Z* and C* except ASCII space)
11913 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011915 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011916 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011921 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011922 /* Map 16-bit characters to '\uxxxx' */
11923 else if (ch <= 0xffff) {
11924 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011929 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011930 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011931 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011932 PyUnicode_WRITE(okind, odata, o++, 'U');
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 }
11942 }
11943 /* Copy characters as-is */
11944 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011946 }
11947 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011950 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011951 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011954PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956\n\
11957Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011958such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959arguments start and end are interpreted as in slice notation.\n\
11960\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011961Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
11963static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011967 Py_ssize_t start;
11968 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011969 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Jesus Ceaac451502011-04-20 17:09:23 +020011971 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11972 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 if (PyUnicode_READY(self) == -1)
11976 return NULL;
11977 if (PyUnicode_READY(substring) == -1)
11978 return NULL;
11979
Victor Stinner7931d9a2011-11-04 00:22:48 +010011980 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (result == -2)
11985 return NULL;
11986
Christian Heimes217cfd12007-12-02 14:31:20 +000011987 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988}
11989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011990PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011993Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
11995static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011998 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011999 Py_ssize_t start;
12000 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012001 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
Jesus Ceaac451502011-04-20 17:09:23 +020012003 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12004 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009 if (PyUnicode_READY(substring) == -1)
12010 return NULL;
12011
Victor Stinner7931d9a2011-11-04 00:22:48 +010012012 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
12014 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (result == -2)
12017 return NULL;
12018
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 if (result < 0) {
12020 PyErr_SetString(PyExc_ValueError, "substring not found");
12021 return NULL;
12022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023
Christian Heimes217cfd12007-12-02 14:31:20 +000012024 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025}
12026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012027PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012030Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012031done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
12033static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012034unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012036 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_UCS4 fillchar = ' ';
12038
Victor Stinnere9a29352011-10-01 02:14:59 +020012039 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012041
Benjamin Petersonbac79492012-01-14 13:34:47 -050012042 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 return NULL;
12044
Victor Stinnerc4b49542011-12-11 22:44:26 +010012045 if (PyUnicode_GET_LENGTH(self) >= width)
12046 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Victor Stinnerc4b49542011-12-11 22:44:26 +010012048 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049}
12050
Alexander Belopolsky40018472011-02-26 01:02:56 +000012051PyObject *
12052PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053{
12054 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012055
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056 s = PyUnicode_FromObject(s);
12057 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 if (sep != NULL) {
12060 sep = PyUnicode_FromObject(sep);
12061 if (sep == NULL) {
12062 Py_DECREF(s);
12063 return NULL;
12064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 }
12066
Victor Stinner9310abb2011-10-05 00:59:23 +020012067 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069 Py_DECREF(s);
12070 Py_XDECREF(sep);
12071 return result;
12072}
12073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012074PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012075 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076\n\
12077Return a list of the words in S, using sep as the\n\
12078delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012079splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012080whitespace string is a separator and empty strings are\n\
12081removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
12083static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012084unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012086 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012088 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012090 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12091 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092 return NULL;
12093
12094 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012095 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012097 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012099 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
Thomas Wouters477c8d52006-05-27 19:21:47 +000012102PyObject *
12103PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12104{
12105 PyObject* str_obj;
12106 PyObject* sep_obj;
12107 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 int kind1, kind2, kind;
12109 void *buf1 = NULL, *buf2 = NULL;
12110 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012111
12112 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012113 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012115 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012116 if (!sep_obj) {
12117 Py_DECREF(str_obj);
12118 return NULL;
12119 }
12120 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12121 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012122 Py_DECREF(str_obj);
12123 return NULL;
12124 }
12125
Victor Stinner14f8f022011-10-05 20:58:25 +020012126 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012128 kind = Py_MAX(kind1, kind2);
12129 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012131 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (!buf1)
12133 goto onError;
12134 buf2 = PyUnicode_DATA(sep_obj);
12135 if (kind2 != kind)
12136 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12137 if (!buf2)
12138 goto onError;
12139 len1 = PyUnicode_GET_LENGTH(str_obj);
12140 len2 = PyUnicode_GET_LENGTH(sep_obj);
12141
Benjamin Petersonead6b532011-12-20 17:23:42 -060012142 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012144 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12145 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12146 else
12147 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 break;
12149 case PyUnicode_2BYTE_KIND:
12150 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12151 break;
12152 case PyUnicode_4BYTE_KIND:
12153 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12154 break;
12155 default:
12156 assert(0);
12157 out = 0;
12158 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012159
12160 Py_DECREF(sep_obj);
12161 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (kind1 != kind)
12163 PyMem_Free(buf1);
12164 if (kind2 != kind)
12165 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166
12167 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 onError:
12169 Py_DECREF(sep_obj);
12170 Py_DECREF(str_obj);
12171 if (kind1 != kind && buf1)
12172 PyMem_Free(buf1);
12173 if (kind2 != kind && buf2)
12174 PyMem_Free(buf2);
12175 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012176}
12177
12178
12179PyObject *
12180PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12181{
12182 PyObject* str_obj;
12183 PyObject* sep_obj;
12184 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 int kind1, kind2, kind;
12186 void *buf1 = NULL, *buf2 = NULL;
12187 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012188
12189 str_obj = PyUnicode_FromObject(str_in);
12190 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012192 sep_obj = PyUnicode_FromObject(sep_in);
12193 if (!sep_obj) {
12194 Py_DECREF(str_obj);
12195 return NULL;
12196 }
12197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 kind1 = PyUnicode_KIND(str_in);
12199 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012200 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 buf1 = PyUnicode_DATA(str_in);
12202 if (kind1 != kind)
12203 buf1 = _PyUnicode_AsKind(str_in, kind);
12204 if (!buf1)
12205 goto onError;
12206 buf2 = PyUnicode_DATA(sep_obj);
12207 if (kind2 != kind)
12208 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12209 if (!buf2)
12210 goto onError;
12211 len1 = PyUnicode_GET_LENGTH(str_obj);
12212 len2 = PyUnicode_GET_LENGTH(sep_obj);
12213
Benjamin Petersonead6b532011-12-20 17:23:42 -060012214 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012216 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12217 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12218 else
12219 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 break;
12221 case PyUnicode_2BYTE_KIND:
12222 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12223 break;
12224 case PyUnicode_4BYTE_KIND:
12225 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12226 break;
12227 default:
12228 assert(0);
12229 out = 0;
12230 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012231
12232 Py_DECREF(sep_obj);
12233 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (kind1 != kind)
12235 PyMem_Free(buf1);
12236 if (kind2 != kind)
12237 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012238
12239 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 onError:
12241 Py_DECREF(sep_obj);
12242 Py_DECREF(str_obj);
12243 if (kind1 != kind && buf1)
12244 PyMem_Free(buf1);
12245 if (kind2 != kind && buf2)
12246 PyMem_Free(buf2);
12247 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012248}
12249
12250PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012252\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012253Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012255found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256
12257static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012258unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259{
Victor Stinner9310abb2011-10-05 00:59:23 +020012260 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261}
12262
12263PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012264 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012266Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012268separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269
12270static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012271unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272{
Victor Stinner9310abb2011-10-05 00:59:23 +020012273 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274}
12275
Alexander Belopolsky40018472011-02-26 01:02:56 +000012276PyObject *
12277PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012278{
12279 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012280
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012281 s = PyUnicode_FromObject(s);
12282 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012283 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 if (sep != NULL) {
12285 sep = PyUnicode_FromObject(sep);
12286 if (sep == NULL) {
12287 Py_DECREF(s);
12288 return NULL;
12289 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012290 }
12291
Victor Stinner9310abb2011-10-05 00:59:23 +020012292 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012293
12294 Py_DECREF(s);
12295 Py_XDECREF(sep);
12296 return result;
12297}
12298
12299PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012300 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012301\n\
12302Return a list of the words in S, using sep as the\n\
12303delimiter string, starting at the end of the string and\n\
12304working to the front. If maxsplit is given, at most maxsplit\n\
12305splits are done. If sep is not specified, any whitespace string\n\
12306is a separator.");
12307
12308static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012309unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012310{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012311 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012313 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012314
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012315 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12316 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012317 return NULL;
12318
12319 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012321 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012322 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012323 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012324 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012325}
12326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012327PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329\n\
12330Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012331Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012332is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333
12334static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012335unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012337 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012338 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012340 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12341 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 return NULL;
12343
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012344 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345}
12346
12347static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012348PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012350 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012353PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355\n\
12356Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012360unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012362 if (PyUnicode_READY(self) == -1)
12363 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012364 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
Georg Brandlceee0772007-11-27 23:48:05 +000012367PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012369\n\
12370Return a translation table usable for str.translate().\n\
12371If there is only one argument, it must be a dictionary mapping Unicode\n\
12372ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012373Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012374If there are two arguments, they must be strings of equal length, and\n\
12375in the resulting dictionary, each character in x will be mapped to the\n\
12376character at the same position in y. If there is a third argument, it\n\
12377must be a string, whose characters will be mapped to None in the result.");
12378
12379static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012380unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012381{
12382 PyObject *x, *y = NULL, *z = NULL;
12383 PyObject *new = NULL, *key, *value;
12384 Py_ssize_t i = 0;
12385 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386
Georg Brandlceee0772007-11-27 23:48:05 +000012387 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12388 return NULL;
12389 new = PyDict_New();
12390 if (!new)
12391 return NULL;
12392 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 int x_kind, y_kind, z_kind;
12394 void *x_data, *y_data, *z_data;
12395
Georg Brandlceee0772007-11-27 23:48:05 +000012396 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012397 if (!PyUnicode_Check(x)) {
12398 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12399 "be a string if there is a second argument");
12400 goto err;
12401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012403 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12404 "arguments must have equal length");
12405 goto err;
12406 }
12407 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 x_kind = PyUnicode_KIND(x);
12409 y_kind = PyUnicode_KIND(y);
12410 x_data = PyUnicode_DATA(x);
12411 y_data = PyUnicode_DATA(y);
12412 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12413 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012414 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012415 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012416 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012417 if (!value) {
12418 Py_DECREF(key);
12419 goto err;
12420 }
Georg Brandlceee0772007-11-27 23:48:05 +000012421 res = PyDict_SetItem(new, key, value);
12422 Py_DECREF(key);
12423 Py_DECREF(value);
12424 if (res < 0)
12425 goto err;
12426 }
12427 /* create entries for deleting chars in z */
12428 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 z_kind = PyUnicode_KIND(z);
12430 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012431 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012433 if (!key)
12434 goto err;
12435 res = PyDict_SetItem(new, key, Py_None);
12436 Py_DECREF(key);
12437 if (res < 0)
12438 goto err;
12439 }
12440 }
12441 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 int kind;
12443 void *data;
12444
Georg Brandlceee0772007-11-27 23:48:05 +000012445 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012446 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012447 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12448 "to maketrans it must be a dict");
12449 goto err;
12450 }
12451 /* copy entries into the new dict, converting string keys to int keys */
12452 while (PyDict_Next(x, &i, &key, &value)) {
12453 if (PyUnicode_Check(key)) {
12454 /* convert string keys to integer keys */
12455 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012456 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012457 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12458 "table must be of length 1");
12459 goto err;
12460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 kind = PyUnicode_KIND(key);
12462 data = PyUnicode_DATA(key);
12463 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012464 if (!newkey)
12465 goto err;
12466 res = PyDict_SetItem(new, newkey, value);
12467 Py_DECREF(newkey);
12468 if (res < 0)
12469 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012470 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012471 /* just keep integer keys */
12472 if (PyDict_SetItem(new, key, value) < 0)
12473 goto err;
12474 } else {
12475 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12476 "be strings or integers");
12477 goto err;
12478 }
12479 }
12480 }
12481 return new;
12482 err:
12483 Py_DECREF(new);
12484 return NULL;
12485}
12486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012487PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489\n\
12490Return a copy of the string S, where all characters have been mapped\n\
12491through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012492Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012493Unmapped characters are left untouched. Characters mapped to None\n\
12494are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495
12496static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500}
12501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012502PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012505Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
12507static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012508unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012510 if (PyUnicode_READY(self) == -1)
12511 return NULL;
12512 if (PyUnicode_IS_ASCII(self))
12513 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012514 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515}
12516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012520Pad a numeric string S with zeros on the left, to fill a field\n\
12521of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012524unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012526 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012527 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012528 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 int kind;
12530 void *data;
12531 Py_UCS4 chr;
12532
Martin v. Löwis18e16552006-02-15 17:27:45 +000012533 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534 return NULL;
12535
Benjamin Petersonbac79492012-01-14 13:34:47 -050012536 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
Victor Stinnerc4b49542011-12-11 22:44:26 +010012539 if (PyUnicode_GET_LENGTH(self) >= width)
12540 return unicode_result_unchanged(self);
12541
12542 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
12544 u = pad(self, fill, 0, '0');
12545
Walter Dörwald068325e2002-04-15 13:36:47 +000012546 if (u == NULL)
12547 return NULL;
12548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 kind = PyUnicode_KIND(u);
12550 data = PyUnicode_DATA(u);
12551 chr = PyUnicode_READ(kind, data, fill);
12552
12553 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 PyUnicode_WRITE(kind, data, 0, chr);
12556 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 }
12558
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012559 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012560 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012571PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012574Return True if S starts with the specified prefix, False otherwise.\n\
12575With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012576With optional end, stop comparing S at that position.\n\
12577prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578
12579static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012580unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012583 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012584 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012585 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012586 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012587 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588
Jesus Ceaac451502011-04-20 17:09:23 +020012589 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012591 if (PyTuple_Check(subobj)) {
12592 Py_ssize_t i;
12593 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012594 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012595 if (substring == NULL)
12596 return NULL;
12597 result = tailmatch(self, substring, start, end, -1);
12598 Py_DECREF(substring);
12599 if (result) {
12600 Py_RETURN_TRUE;
12601 }
12602 }
12603 /* nothing matched */
12604 Py_RETURN_FALSE;
12605 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012606 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012607 if (substring == NULL) {
12608 if (PyErr_ExceptionMatches(PyExc_TypeError))
12609 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12610 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012612 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012613 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012615 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616}
12617
12618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012619PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012622Return True if S ends with the specified suffix, False otherwise.\n\
12623With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012624With optional end, stop comparing S at that position.\n\
12625suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012631 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012632 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012633 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012634 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636
Jesus Ceaac451502011-04-20 17:09:23 +020012637 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639 if (PyTuple_Check(subobj)) {
12640 Py_ssize_t i;
12641 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012642 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012644 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646 result = tailmatch(self, substring, start, end, +1);
12647 Py_DECREF(substring);
12648 if (result) {
12649 Py_RETURN_TRUE;
12650 }
12651 }
12652 Py_RETURN_FALSE;
12653 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012654 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012655 if (substring == NULL) {
12656 if (PyErr_ExceptionMatches(PyExc_TypeError))
12657 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12658 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012660 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012663 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664}
12665
Victor Stinner202fdca2012-05-07 12:47:02 +020012666Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012667_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012668{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012669 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012670 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12671 writer->data = PyUnicode_DATA(writer->buffer);
12672 writer->kind = PyUnicode_KIND(writer->buffer);
12673}
12674
Victor Stinnerd3f08822012-05-29 12:57:52 +020012675void
12676_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012677{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012678 memset(writer, 0, sizeof(*writer));
12679#ifdef Py_DEBUG
12680 writer->kind = 5; /* invalid kind */
12681#endif
12682 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012683 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012684}
12685
Victor Stinnerd3f08822012-05-29 12:57:52 +020012686int
12687_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12688 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012689{
12690 Py_ssize_t newlen;
12691 PyObject *newbuffer;
12692
Victor Stinnerd3f08822012-05-29 12:57:52 +020012693 assert(length > 0);
12694
Victor Stinner202fdca2012-05-07 12:47:02 +020012695 if (length > PY_SSIZE_T_MAX - writer->pos) {
12696 PyErr_NoMemory();
12697 return -1;
12698 }
12699 newlen = writer->pos + length;
12700
Victor Stinnerd3f08822012-05-29 12:57:52 +020012701 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012702 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012703 /* overallocate 25% to limit the number of resize */
12704 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12705 newlen += newlen / 4;
12706 if (newlen < writer->min_length)
12707 newlen = writer->min_length;
12708 }
12709 writer->buffer = PyUnicode_New(newlen, maxchar);
12710 if (writer->buffer == NULL)
12711 return -1;
12712 _PyUnicodeWriter_Update(writer);
12713 return 0;
12714 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012715
Victor Stinnerd3f08822012-05-29 12:57:52 +020012716 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012717 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012718 /* overallocate 25% to limit the number of resize */
12719 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12720 newlen += newlen / 4;
12721 if (newlen < writer->min_length)
12722 newlen = writer->min_length;
12723 }
12724
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012725 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012726 /* resize + widen */
12727 newbuffer = PyUnicode_New(newlen, maxchar);
12728 if (newbuffer == NULL)
12729 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012730 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12731 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012732 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012733 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012734 }
12735 else {
12736 newbuffer = resize_compact(writer->buffer, newlen);
12737 if (newbuffer == NULL)
12738 return -1;
12739 }
12740 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012741 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012742 }
12743 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012744 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012745 newbuffer = PyUnicode_New(writer->size, maxchar);
12746 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012747 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012748 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12749 writer->buffer, 0, writer->pos);
12750 Py_DECREF(writer->buffer);
12751 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012752 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012753 }
12754 return 0;
12755}
12756
Victor Stinnerd3f08822012-05-29 12:57:52 +020012757int
12758_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12759{
12760 Py_UCS4 maxchar;
12761 Py_ssize_t len;
12762
12763 if (PyUnicode_READY(str) == -1)
12764 return -1;
12765 len = PyUnicode_GET_LENGTH(str);
12766 if (len == 0)
12767 return 0;
12768 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12769 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012770 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012771 Py_INCREF(str);
12772 writer->buffer = str;
12773 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012774 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012775 writer->size = 0;
12776 writer->pos += len;
12777 return 0;
12778 }
12779 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12780 return -1;
12781 }
12782 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12783 str, 0, len);
12784 writer->pos += len;
12785 return 0;
12786}
12787
Victor Stinnere215d962012-10-06 23:03:36 +020012788int
12789_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12790{
12791 Py_UCS4 maxchar;
12792
12793 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12794 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12795 return -1;
12796 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12797 writer->pos += len;
12798 return 0;
12799}
12800
Victor Stinnerd3f08822012-05-29 12:57:52 +020012801PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012802_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012803{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012804 if (writer->pos == 0) {
12805 Py_XDECREF(writer->buffer);
12806 Py_INCREF(unicode_empty);
12807 return unicode_empty;
12808 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012809 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012810 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12811 return writer->buffer;
12812 }
12813 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12814 PyObject *newbuffer;
12815 newbuffer = resize_compact(writer->buffer, writer->pos);
12816 if (newbuffer == NULL) {
12817 Py_DECREF(writer->buffer);
12818 return NULL;
12819 }
12820 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012821 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012822 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012823 return writer->buffer;
12824}
12825
Victor Stinnerd3f08822012-05-29 12:57:52 +020012826void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012827_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012828{
12829 Py_CLEAR(writer->buffer);
12830}
12831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012833
12834PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012836\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012837Return a formatted version of S, using substitutions from args and kwargs.\n\
12838The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012839
Eric Smith27bbca62010-11-04 17:06:58 +000012840PyDoc_STRVAR(format_map__doc__,
12841 "S.format_map(mapping) -> str\n\
12842\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012843Return a formatted version of S, using substitutions from mapping.\n\
12844The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012845
Eric Smith4a7d76d2008-05-30 18:10:19 +000012846static PyObject *
12847unicode__format__(PyObject* self, PyObject* args)
12848{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012849 PyObject *format_spec;
12850 _PyUnicodeWriter writer;
12851 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012852
12853 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12854 return NULL;
12855
Victor Stinnerd3f08822012-05-29 12:57:52 +020012856 if (PyUnicode_READY(self) == -1)
12857 return NULL;
12858 _PyUnicodeWriter_Init(&writer, 0);
12859 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12860 self, format_spec, 0,
12861 PyUnicode_GET_LENGTH(format_spec));
12862 if (ret == -1) {
12863 _PyUnicodeWriter_Dealloc(&writer);
12864 return NULL;
12865 }
12866 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012867}
12868
Eric Smith8c663262007-08-25 02:26:07 +000012869PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012871\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012872Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012873
12874static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 Py_ssize_t size;
12878
12879 /* If it's a compact object, account for base structure +
12880 character data. */
12881 if (PyUnicode_IS_COMPACT_ASCII(v))
12882 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12883 else if (PyUnicode_IS_COMPACT(v))
12884 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012885 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 else {
12887 /* If it is a two-block object, account for base object, and
12888 for character block if present. */
12889 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012890 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012892 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 }
12894 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012895 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012896 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012898 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012899 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900
12901 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012902}
12903
12904PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012906
12907static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012908unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012909{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012910 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 if (!copy)
12912 return NULL;
12913 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012914}
12915
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012917 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012918 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012919 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12920 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012921 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12922 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012923 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012924 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12925 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12926 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12927 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12928 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012929 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012930 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12931 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12932 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012933 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012934 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12935 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12936 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012937 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012938 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012939 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012940 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012941 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12942 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12943 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12944 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12945 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12946 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12947 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12948 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12949 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12950 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12951 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12952 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12953 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12954 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012955 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012956 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012957 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012958 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012959 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012960 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012961 {"maketrans", (PyCFunction) unicode_maketrans,
12962 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012963 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012964#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012965 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012966 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967#endif
12968
Benjamin Peterson14339b62009-01-31 16:36:08 +000012969 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970 {NULL, NULL}
12971};
12972
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012973static PyObject *
12974unicode_mod(PyObject *v, PyObject *w)
12975{
Brian Curtindfc80e32011-08-10 20:28:54 -050012976 if (!PyUnicode_Check(v))
12977 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012979}
12980
12981static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012982 0, /*nb_add*/
12983 0, /*nb_subtract*/
12984 0, /*nb_multiply*/
12985 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012986};
12987
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012989 (lenfunc) unicode_length, /* sq_length */
12990 PyUnicode_Concat, /* sq_concat */
12991 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12992 (ssizeargfunc) unicode_getitem, /* sq_item */
12993 0, /* sq_slice */
12994 0, /* sq_ass_item */
12995 0, /* sq_ass_slice */
12996 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997};
12998
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012999static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013000unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 if (PyUnicode_READY(self) == -1)
13003 return NULL;
13004
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013005 if (PyIndex_Check(item)) {
13006 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013007 if (i == -1 && PyErr_Occurred())
13008 return NULL;
13009 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013011 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013012 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013013 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013014 PyObject *result;
13015 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013016 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013017 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013020 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021 return NULL;
13022 }
13023
13024 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013025 Py_INCREF(unicode_empty);
13026 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013028 slicelength == PyUnicode_GET_LENGTH(self)) {
13029 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013030 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013031 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013032 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013035 src_kind = PyUnicode_KIND(self);
13036 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013037 if (!PyUnicode_IS_ASCII(self)) {
13038 kind_limit = kind_maxchar_limit(src_kind);
13039 max_char = 0;
13040 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13041 ch = PyUnicode_READ(src_kind, src_data, cur);
13042 if (ch > max_char) {
13043 max_char = ch;
13044 if (max_char >= kind_limit)
13045 break;
13046 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013047 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013048 }
Victor Stinner55c99112011-10-13 01:17:06 +020013049 else
13050 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013051 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013052 if (result == NULL)
13053 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013054 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013055 dest_data = PyUnicode_DATA(result);
13056
13057 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13059 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013060 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013061 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013063 } else {
13064 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13065 return NULL;
13066 }
13067}
13068
13069static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 (lenfunc)unicode_length, /* mp_length */
13071 (binaryfunc)unicode_subscript, /* mp_subscript */
13072 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073};
13074
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076/* Helpers for PyUnicode_Format() */
13077
Victor Stinnera47082312012-10-04 02:19:54 +020013078struct unicode_formatter_t {
13079 PyObject *args;
13080 int args_owned;
13081 Py_ssize_t arglen, argidx;
13082 PyObject *dict;
13083
13084 enum PyUnicode_Kind fmtkind;
13085 Py_ssize_t fmtcnt, fmtpos;
13086 void *fmtdata;
13087 PyObject *fmtstr;
13088
13089 _PyUnicodeWriter writer;
13090};
13091
13092struct unicode_format_arg_t {
13093 Py_UCS4 ch;
13094 int flags;
13095 Py_ssize_t width;
13096 int prec;
13097 int sign;
13098};
13099
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013101unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Victor Stinnera47082312012-10-04 02:19:54 +020013103 Py_ssize_t argidx = ctx->argidx;
13104
13105 if (argidx < ctx->arglen) {
13106 ctx->argidx++;
13107 if (ctx->arglen < 0)
13108 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 else
Victor Stinnera47082312012-10-04 02:19:54 +020013110 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 }
13112 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 return NULL;
13115}
13116
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
Victor Stinnera47082312012-10-04 02:19:54 +020013119/* Format a float into the writer if the writer is not NULL, or into *p_output
13120 otherwise.
13121
13122 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013123static int
Victor Stinnera47082312012-10-04 02:19:54 +020013124formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13125 PyObject **p_output,
13126 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013128 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013130 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013131 int prec;
13132 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013133
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134 x = PyFloat_AsDouble(v);
13135 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013136 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013137
Victor Stinnera47082312012-10-04 02:19:54 +020013138 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013141
Victor Stinnera47082312012-10-04 02:19:54 +020013142 if (arg->flags & F_ALT)
13143 dtoa_flags = Py_DTSF_ALT;
13144 else
13145 dtoa_flags = 0;
13146 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013147 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013148 return -1;
13149 len = strlen(p);
13150 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013151 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13152 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013153 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013154 }
Victor Stinner184252a2012-06-16 02:57:41 +020013155 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013156 writer->pos += len;
13157 }
13158 else
13159 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013160 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013161 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162}
13163
Victor Stinnerd0880d52012-04-27 23:40:13 +020013164/* formatlong() emulates the format codes d, u, o, x and X, and
13165 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13166 * Python's regular ints.
13167 * Return value: a new PyUnicodeObject*, or NULL if error.
13168 * The output string is of the form
13169 * "-"? ("0x" | "0X")? digit+
13170 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13171 * set in flags. The case of hex digits will be correct,
13172 * There will be at least prec digits, zero-filled on the left if
13173 * necessary to get that many.
13174 * val object to be converted
13175 * flags bitmask of format flags; only F_ALT is looked at
13176 * prec minimum number of digits; 0-fill on left if needed
13177 * type a character in [duoxX]; u acts the same as d
13178 *
13179 * CAUTION: o, x and X conversions on regular ints can never
13180 * produce a '-' sign, but can for Python's unbounded ints.
13181 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013182static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013183formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013184{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013185 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013187 Py_ssize_t i;
13188 int sign; /* 1 if '-', else 0 */
13189 int len; /* number of characters */
13190 Py_ssize_t llen;
13191 int numdigits; /* len == numnondigits + numdigits */
13192 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013193 int prec = arg->prec;
13194 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013195
Victor Stinnerd0880d52012-04-27 23:40:13 +020013196 /* Avoid exceeding SSIZE_T_MAX */
13197 if (prec > INT_MAX-3) {
13198 PyErr_SetString(PyExc_OverflowError,
13199 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013201 }
13202
13203 assert(PyLong_Check(val));
13204
13205 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013206 default:
13207 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013208 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013209 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013210 case 'u':
13211 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013212 if (PyBool_Check(val))
13213 result = PyNumber_ToBase(val, 10);
13214 else
13215 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013216 break;
13217 case 'o':
13218 numnondigits = 2;
13219 result = PyNumber_ToBase(val, 8);
13220 break;
13221 case 'x':
13222 case 'X':
13223 numnondigits = 2;
13224 result = PyNumber_ToBase(val, 16);
13225 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013226 }
13227 if (!result)
13228 return NULL;
13229
13230 assert(unicode_modifiable(result));
13231 assert(PyUnicode_IS_READY(result));
13232 assert(PyUnicode_IS_ASCII(result));
13233
13234 /* To modify the string in-place, there can only be one reference. */
13235 if (Py_REFCNT(result) != 1) {
13236 PyErr_BadInternalCall();
13237 return NULL;
13238 }
13239 buf = PyUnicode_DATA(result);
13240 llen = PyUnicode_GET_LENGTH(result);
13241 if (llen > INT_MAX) {
13242 PyErr_SetString(PyExc_ValueError,
13243 "string too large in _PyBytes_FormatLong");
13244 return NULL;
13245 }
13246 len = (int)llen;
13247 sign = buf[0] == '-';
13248 numnondigits += sign;
13249 numdigits = len - numnondigits;
13250 assert(numdigits > 0);
13251
13252 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013253 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 (type == 'o' || type == 'x' || type == 'X'))) {
13255 assert(buf[sign] == '0');
13256 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13257 buf[sign+1] == 'o');
13258 numnondigits -= 2;
13259 buf += 2;
13260 len -= 2;
13261 if (sign)
13262 buf[0] = '-';
13263 assert(len == numnondigits + numdigits);
13264 assert(numdigits > 0);
13265 }
13266
13267 /* Fill with leading zeroes to meet minimum width. */
13268 if (prec > numdigits) {
13269 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13270 numnondigits + prec);
13271 char *b1;
13272 if (!r1) {
13273 Py_DECREF(result);
13274 return NULL;
13275 }
13276 b1 = PyBytes_AS_STRING(r1);
13277 for (i = 0; i < numnondigits; ++i)
13278 *b1++ = *buf++;
13279 for (i = 0; i < prec - numdigits; i++)
13280 *b1++ = '0';
13281 for (i = 0; i < numdigits; i++)
13282 *b1++ = *buf++;
13283 *b1 = '\0';
13284 Py_DECREF(result);
13285 result = r1;
13286 buf = PyBytes_AS_STRING(result);
13287 len = numnondigits + prec;
13288 }
13289
13290 /* Fix up case for hex conversions. */
13291 if (type == 'X') {
13292 /* Need to convert all lower case letters to upper case.
13293 and need to convert 0x to 0X (and -0x to -0X). */
13294 for (i = 0; i < len; i++)
13295 if (buf[i] >= 'a' && buf[i] <= 'x')
13296 buf[i] -= 'a'-'A';
13297 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013298 if (!PyUnicode_Check(result)
13299 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013300 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013302 Py_DECREF(result);
13303 result = unicode;
13304 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013305 else if (len != PyUnicode_GET_LENGTH(result)) {
13306 if (PyUnicode_Resize(&result, len) < 0)
13307 Py_CLEAR(result);
13308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013310}
13311
Victor Stinner621ef3d2012-10-02 00:33:47 +020013312/* Format an integer.
13313 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013314 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013315 * -1 and raise an exception on error */
13316static int
Victor Stinnera47082312012-10-04 02:19:54 +020013317mainformatlong(PyObject *v,
13318 struct unicode_format_arg_t *arg,
13319 PyObject **p_output,
13320 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013321{
13322 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013323 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013324
13325 if (!PyNumber_Check(v))
13326 goto wrongtype;
13327
13328 if (!PyLong_Check(v)) {
13329 iobj = PyNumber_Long(v);
13330 if (iobj == NULL) {
13331 if (PyErr_ExceptionMatches(PyExc_TypeError))
13332 goto wrongtype;
13333 return -1;
13334 }
13335 assert(PyLong_Check(iobj));
13336 }
13337 else {
13338 iobj = v;
13339 Py_INCREF(iobj);
13340 }
13341
13342 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013343 && arg->width == -1 && arg->prec == -1
13344 && !(arg->flags & (F_SIGN | F_BLANK))
13345 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013346 {
13347 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013348 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013349 int base;
13350
Victor Stinnera47082312012-10-04 02:19:54 +020013351 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013352 {
13353 default:
13354 assert(0 && "'type' not in [diuoxX]");
13355 case 'd':
13356 case 'i':
13357 case 'u':
13358 base = 10;
13359 break;
13360 case 'o':
13361 base = 8;
13362 break;
13363 case 'x':
13364 case 'X':
13365 base = 16;
13366 break;
13367 }
13368
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013369 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13370 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013371 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013372 }
13373 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013374 return 1;
13375 }
13376
Victor Stinnera47082312012-10-04 02:19:54 +020013377 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013378 Py_DECREF(iobj);
13379 if (res == NULL)
13380 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013381 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013382 return 0;
13383
13384wrongtype:
13385 PyErr_Format(PyExc_TypeError,
13386 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013387 "not %.200s",
13388 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013389 return -1;
13390}
13391
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392static Py_UCS4
13393formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013395 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013396 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 goto onError;
13401 }
13402 else {
13403 /* Integer input truncated to a character */
13404 long x;
13405 x = PyLong_AsLong(v);
13406 if (x == -1 && PyErr_Occurred())
13407 goto onError;
13408
Victor Stinner8faf8212011-12-08 22:14:11 +010013409 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 PyErr_SetString(PyExc_OverflowError,
13411 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013412 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 }
13414
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013416 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013417
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013419 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013421 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
Victor Stinnera47082312012-10-04 02:19:54 +020013424/* Parse options of an argument: flags, width, precision.
13425 Handle also "%(name)" syntax.
13426
13427 Return 0 if the argument has been formatted into arg->str.
13428 Return 1 if the argument has been written into ctx->writer,
13429 Raise an exception and return -1 on error. */
13430static int
13431unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13432 struct unicode_format_arg_t *arg)
13433{
13434#define FORMAT_READ(ctx) \
13435 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13436
13437 PyObject *v;
13438
13439 arg->ch = FORMAT_READ(ctx);
13440 if (arg->ch == '(') {
13441 /* Get argument value from a dictionary. Example: "%(name)s". */
13442 Py_ssize_t keystart;
13443 Py_ssize_t keylen;
13444 PyObject *key;
13445 int pcount = 1;
13446
13447 if (ctx->dict == NULL) {
13448 PyErr_SetString(PyExc_TypeError,
13449 "format requires a mapping");
13450 return -1;
13451 }
13452 ++ctx->fmtpos;
13453 --ctx->fmtcnt;
13454 keystart = ctx->fmtpos;
13455 /* Skip over balanced parentheses */
13456 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13457 arg->ch = FORMAT_READ(ctx);
13458 if (arg->ch == ')')
13459 --pcount;
13460 else if (arg->ch == '(')
13461 ++pcount;
13462 ctx->fmtpos++;
13463 }
13464 keylen = ctx->fmtpos - keystart - 1;
13465 if (ctx->fmtcnt < 0 || pcount > 0) {
13466 PyErr_SetString(PyExc_ValueError,
13467 "incomplete format key");
13468 return -1;
13469 }
13470 key = PyUnicode_Substring(ctx->fmtstr,
13471 keystart, keystart + keylen);
13472 if (key == NULL)
13473 return -1;
13474 if (ctx->args_owned) {
13475 Py_DECREF(ctx->args);
13476 ctx->args_owned = 0;
13477 }
13478 ctx->args = PyObject_GetItem(ctx->dict, key);
13479 Py_DECREF(key);
13480 if (ctx->args == NULL)
13481 return -1;
13482 ctx->args_owned = 1;
13483 ctx->arglen = -1;
13484 ctx->argidx = -2;
13485 }
13486
13487 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13488 arg->flags = 0;
13489 while (--ctx->fmtcnt >= 0) {
13490 arg->ch = FORMAT_READ(ctx);
13491 ctx->fmtpos++;
13492 switch (arg->ch) {
13493 case '-': arg->flags |= F_LJUST; continue;
13494 case '+': arg->flags |= F_SIGN; continue;
13495 case ' ': arg->flags |= F_BLANK; continue;
13496 case '#': arg->flags |= F_ALT; continue;
13497 case '0': arg->flags |= F_ZERO; continue;
13498 }
13499 break;
13500 }
13501
13502 /* Parse width. Example: "%10s" => width=10 */
13503 arg->width = -1;
13504 if (arg->ch == '*') {
13505 v = unicode_format_getnextarg(ctx);
13506 if (v == NULL)
13507 return -1;
13508 if (!PyLong_Check(v)) {
13509 PyErr_SetString(PyExc_TypeError,
13510 "* wants int");
13511 return -1;
13512 }
13513 arg->width = PyLong_AsLong(v);
13514 if (arg->width == -1 && PyErr_Occurred())
13515 return -1;
13516 if (arg->width < 0) {
13517 arg->flags |= F_LJUST;
13518 arg->width = -arg->width;
13519 }
13520 if (--ctx->fmtcnt >= 0) {
13521 arg->ch = FORMAT_READ(ctx);
13522 ctx->fmtpos++;
13523 }
13524 }
13525 else if (arg->ch >= '0' && arg->ch <= '9') {
13526 arg->width = arg->ch - '0';
13527 while (--ctx->fmtcnt >= 0) {
13528 arg->ch = FORMAT_READ(ctx);
13529 ctx->fmtpos++;
13530 if (arg->ch < '0' || arg->ch > '9')
13531 break;
13532 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13533 mixing signed and unsigned comparison. Since arg->ch is between
13534 '0' and '9', casting to int is safe. */
13535 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13536 PyErr_SetString(PyExc_ValueError,
13537 "width too big");
13538 return -1;
13539 }
13540 arg->width = arg->width*10 + (arg->ch - '0');
13541 }
13542 }
13543
13544 /* Parse precision. Example: "%.3f" => prec=3 */
13545 arg->prec = -1;
13546 if (arg->ch == '.') {
13547 arg->prec = 0;
13548 if (--ctx->fmtcnt >= 0) {
13549 arg->ch = FORMAT_READ(ctx);
13550 ctx->fmtpos++;
13551 }
13552 if (arg->ch == '*') {
13553 v = unicode_format_getnextarg(ctx);
13554 if (v == NULL)
13555 return -1;
13556 if (!PyLong_Check(v)) {
13557 PyErr_SetString(PyExc_TypeError,
13558 "* wants int");
13559 return -1;
13560 }
13561 arg->prec = PyLong_AsLong(v);
13562 if (arg->prec == -1 && PyErr_Occurred())
13563 return -1;
13564 if (arg->prec < 0)
13565 arg->prec = 0;
13566 if (--ctx->fmtcnt >= 0) {
13567 arg->ch = FORMAT_READ(ctx);
13568 ctx->fmtpos++;
13569 }
13570 }
13571 else if (arg->ch >= '0' && arg->ch <= '9') {
13572 arg->prec = arg->ch - '0';
13573 while (--ctx->fmtcnt >= 0) {
13574 arg->ch = FORMAT_READ(ctx);
13575 ctx->fmtpos++;
13576 if (arg->ch < '0' || arg->ch > '9')
13577 break;
13578 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13579 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013580 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013581 return -1;
13582 }
13583 arg->prec = arg->prec*10 + (arg->ch - '0');
13584 }
13585 }
13586 }
13587
13588 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13589 if (ctx->fmtcnt >= 0) {
13590 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13591 if (--ctx->fmtcnt >= 0) {
13592 arg->ch = FORMAT_READ(ctx);
13593 ctx->fmtpos++;
13594 }
13595 }
13596 }
13597 if (ctx->fmtcnt < 0) {
13598 PyErr_SetString(PyExc_ValueError,
13599 "incomplete format");
13600 return -1;
13601 }
13602 return 0;
13603
13604#undef FORMAT_READ
13605}
13606
13607/* Format one argument. Supported conversion specifiers:
13608
13609 - "s", "r", "a": any type
13610 - "i", "d", "u", "o", "x", "X": int
13611 - "e", "E", "f", "F", "g", "G": float
13612 - "c": int or str (1 character)
13613
13614 Return 0 if the argument has been formatted into *p_str,
13615 1 if the argument has been written into ctx->writer,
13616 -1 on error. */
13617static int
13618unicode_format_arg_format(struct unicode_formatter_t *ctx,
13619 struct unicode_format_arg_t *arg,
13620 PyObject **p_str)
13621{
13622 PyObject *v;
13623 _PyUnicodeWriter *writer = &ctx->writer;
13624
13625 if (ctx->fmtcnt == 0)
13626 ctx->writer.overallocate = 0;
13627
13628 if (arg->ch == '%') {
13629 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13630 return -1;
13631 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13632 writer->pos += 1;
13633 return 1;
13634 }
13635
13636 v = unicode_format_getnextarg(ctx);
13637 if (v == NULL)
13638 return -1;
13639
13640 arg->sign = 0;
13641
13642 switch (arg->ch) {
13643
13644 case 's':
13645 case 'r':
13646 case 'a':
13647 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13648 /* Fast path */
13649 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13650 return -1;
13651 return 1;
13652 }
13653
13654 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13655 *p_str = v;
13656 Py_INCREF(*p_str);
13657 }
13658 else {
13659 if (arg->ch == 's')
13660 *p_str = PyObject_Str(v);
13661 else if (arg->ch == 'r')
13662 *p_str = PyObject_Repr(v);
13663 else
13664 *p_str = PyObject_ASCII(v);
13665 }
13666 break;
13667
13668 case 'i':
13669 case 'd':
13670 case 'u':
13671 case 'o':
13672 case 'x':
13673 case 'X':
13674 {
13675 int ret = mainformatlong(v, arg, p_str, writer);
13676 if (ret != 0)
13677 return ret;
13678 arg->sign = 1;
13679 break;
13680 }
13681
13682 case 'e':
13683 case 'E':
13684 case 'f':
13685 case 'F':
13686 case 'g':
13687 case 'G':
13688 if (arg->width == -1 && arg->prec == -1
13689 && !(arg->flags & (F_SIGN | F_BLANK)))
13690 {
13691 /* Fast path */
13692 if (formatfloat(v, arg, NULL, writer) == -1)
13693 return -1;
13694 return 1;
13695 }
13696
13697 arg->sign = 1;
13698 if (formatfloat(v, arg, p_str, NULL) == -1)
13699 return -1;
13700 break;
13701
13702 case 'c':
13703 {
13704 Py_UCS4 ch = formatchar(v);
13705 if (ch == (Py_UCS4) -1)
13706 return -1;
13707 if (arg->width == -1 && arg->prec == -1) {
13708 /* Fast path */
13709 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13710 return -1;
13711 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13712 writer->pos += 1;
13713 return 1;
13714 }
13715 *p_str = PyUnicode_FromOrdinal(ch);
13716 break;
13717 }
13718
13719 default:
13720 PyErr_Format(PyExc_ValueError,
13721 "unsupported format character '%c' (0x%x) "
13722 "at index %zd",
13723 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13724 (int)arg->ch,
13725 ctx->fmtpos - 1);
13726 return -1;
13727 }
13728 if (*p_str == NULL)
13729 return -1;
13730 assert (PyUnicode_Check(*p_str));
13731 return 0;
13732}
13733
13734static int
13735unicode_format_arg_output(struct unicode_formatter_t *ctx,
13736 struct unicode_format_arg_t *arg,
13737 PyObject *str)
13738{
13739 Py_ssize_t len;
13740 enum PyUnicode_Kind kind;
13741 void *pbuf;
13742 Py_ssize_t pindex;
13743 Py_UCS4 signchar;
13744 Py_ssize_t buflen;
13745 Py_UCS4 maxchar, bufmaxchar;
13746 Py_ssize_t sublen;
13747 _PyUnicodeWriter *writer = &ctx->writer;
13748 Py_UCS4 fill;
13749
13750 fill = ' ';
13751 if (arg->sign && arg->flags & F_ZERO)
13752 fill = '0';
13753
13754 if (PyUnicode_READY(str) == -1)
13755 return -1;
13756
13757 len = PyUnicode_GET_LENGTH(str);
13758 if ((arg->width == -1 || arg->width <= len)
13759 && (arg->prec == -1 || arg->prec >= len)
13760 && !(arg->flags & (F_SIGN | F_BLANK)))
13761 {
13762 /* Fast path */
13763 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13764 return -1;
13765 return 0;
13766 }
13767
13768 /* Truncate the string for "s", "r" and "a" formats
13769 if the precision is set */
13770 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13771 if (arg->prec >= 0 && len > arg->prec)
13772 len = arg->prec;
13773 }
13774
13775 /* Adjust sign and width */
13776 kind = PyUnicode_KIND(str);
13777 pbuf = PyUnicode_DATA(str);
13778 pindex = 0;
13779 signchar = '\0';
13780 if (arg->sign) {
13781 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13782 if (ch == '-' || ch == '+') {
13783 signchar = ch;
13784 len--;
13785 pindex++;
13786 }
13787 else if (arg->flags & F_SIGN)
13788 signchar = '+';
13789 else if (arg->flags & F_BLANK)
13790 signchar = ' ';
13791 else
13792 arg->sign = 0;
13793 }
13794 if (arg->width < len)
13795 arg->width = len;
13796
13797 /* Prepare the writer */
13798 bufmaxchar = 127;
13799 if (!(arg->flags & F_LJUST)) {
13800 if (arg->sign) {
13801 if ((arg->width-1) > len)
13802 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13803 }
13804 else {
13805 if (arg->width > len)
13806 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13807 }
13808 }
13809 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13810 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13811 buflen = arg->width;
13812 if (arg->sign && len == arg->width)
13813 buflen++;
13814 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13815 return -1;
13816
13817 /* Write the sign if needed */
13818 if (arg->sign) {
13819 if (fill != ' ') {
13820 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13821 writer->pos += 1;
13822 }
13823 if (arg->width > len)
13824 arg->width--;
13825 }
13826
13827 /* Write the numeric prefix for "x", "X" and "o" formats
13828 if the alternate form is used.
13829 For example, write "0x" for the "%#x" format. */
13830 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13831 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13832 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13833 if (fill != ' ') {
13834 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13835 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13836 writer->pos += 2;
13837 pindex += 2;
13838 }
13839 arg->width -= 2;
13840 if (arg->width < 0)
13841 arg->width = 0;
13842 len -= 2;
13843 }
13844
13845 /* Pad left with the fill character if needed */
13846 if (arg->width > len && !(arg->flags & F_LJUST)) {
13847 sublen = arg->width - len;
13848 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13849 writer->pos += sublen;
13850 arg->width = len;
13851 }
13852
13853 /* If padding with spaces: write sign if needed and/or numeric prefix if
13854 the alternate form is used */
13855 if (fill == ' ') {
13856 if (arg->sign) {
13857 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13858 writer->pos += 1;
13859 }
13860 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13861 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13862 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13863 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13864 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13865 writer->pos += 2;
13866 pindex += 2;
13867 }
13868 }
13869
13870 /* Write characters */
13871 if (len) {
13872 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13873 str, pindex, len);
13874 writer->pos += len;
13875 }
13876
13877 /* Pad right with the fill character if needed */
13878 if (arg->width > len) {
13879 sublen = arg->width - len;
13880 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13881 writer->pos += sublen;
13882 }
13883 return 0;
13884}
13885
13886/* Helper of PyUnicode_Format(): format one arg.
13887 Return 0 on success, raise an exception and return -1 on error. */
13888static int
13889unicode_format_arg(struct unicode_formatter_t *ctx)
13890{
13891 struct unicode_format_arg_t arg;
13892 PyObject *str;
13893 int ret;
13894
13895 ret = unicode_format_arg_parse(ctx, &arg);
13896 if (ret == -1)
13897 return -1;
13898
13899 ret = unicode_format_arg_format(ctx, &arg, &str);
13900 if (ret == -1)
13901 return -1;
13902
13903 if (ret != 1) {
13904 ret = unicode_format_arg_output(ctx, &arg, str);
13905 Py_DECREF(str);
13906 if (ret == -1)
13907 return -1;
13908 }
13909
13910 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13911 PyErr_SetString(PyExc_TypeError,
13912 "not all arguments converted during string formatting");
13913 return -1;
13914 }
13915 return 0;
13916}
13917
Alexander Belopolsky40018472011-02-26 01:02:56 +000013918PyObject *
13919PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920{
Victor Stinnera47082312012-10-04 02:19:54 +020013921 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013922
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 PyErr_BadInternalCall();
13925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926 }
Victor Stinnera47082312012-10-04 02:19:54 +020013927
13928 ctx.fmtstr = PyUnicode_FromObject(format);
13929 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013930 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013931 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13932 Py_DECREF(ctx.fmtstr);
13933 return NULL;
13934 }
13935 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13936 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13937 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13938 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013939
Victor Stinnera47082312012-10-04 02:19:54 +020013940 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013941
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013943 ctx.arglen = PyTuple_Size(args);
13944 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945 }
13946 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013947 ctx.arglen = -1;
13948 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949 }
Victor Stinnera47082312012-10-04 02:19:54 +020013950 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013951 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013952 ctx.dict = args;
13953 else
13954 ctx.dict = NULL;
13955 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956
Victor Stinnera47082312012-10-04 02:19:54 +020013957 while (--ctx.fmtcnt >= 0) {
13958 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13959 Py_ssize_t nonfmtpos, sublen;
13960 Py_UCS4 maxchar;
13961
13962 nonfmtpos = ctx.fmtpos++;
13963 while (ctx.fmtcnt >= 0 &&
13964 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13965 ctx.fmtpos++;
13966 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 }
Victor Stinnera47082312012-10-04 02:19:54 +020013968 if (ctx.fmtcnt < 0) {
13969 ctx.fmtpos--;
13970 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013971 }
Victor Stinnera47082312012-10-04 02:19:54 +020013972 sublen = ctx.fmtpos - nonfmtpos;
13973 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013974 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013975 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013976 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013977
Victor Stinnera47082312012-10-04 02:19:54 +020013978 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13979 ctx.fmtstr, nonfmtpos, sublen);
13980 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 }
13982 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013983 ctx.fmtpos++;
13984 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013986 }
13987 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013988
Victor Stinnera47082312012-10-04 02:19:54 +020013989 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013990 PyErr_SetString(PyExc_TypeError,
13991 "not all arguments converted during string formatting");
13992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993 }
13994
Victor Stinnera47082312012-10-04 02:19:54 +020013995 if (ctx.args_owned) {
13996 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997 }
Victor Stinnera47082312012-10-04 02:19:54 +020013998 Py_DECREF(ctx.fmtstr);
13999 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014000
Benjamin Peterson29060642009-01-31 22:14:21 +000014001 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014002 Py_DECREF(ctx.fmtstr);
14003 _PyUnicodeWriter_Dealloc(&ctx.writer);
14004 if (ctx.args_owned) {
14005 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014006 }
14007 return NULL;
14008}
14009
Jeremy Hylton938ace62002-07-17 16:30:39 +000014010static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014011unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14012
Tim Peters6d6c1a32001-08-02 04:15:00 +000014013static PyObject *
14014unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14015{
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 static char *kwlist[] = {"object", "encoding", "errors", 0};
14018 char *encoding = NULL;
14019 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014020
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 if (type != &PyUnicode_Type)
14022 return unicode_subtype_new(type, args, kwds);
14023 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014026 if (x == NULL) {
14027 Py_INCREF(unicode_empty);
14028 return unicode_empty;
14029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 if (encoding == NULL && errors == NULL)
14031 return PyObject_Str(x);
14032 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014033 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014034}
14035
Guido van Rossume023fe02001-08-30 03:12:59 +000014036static PyObject *
14037unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14038{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014039 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014040 Py_ssize_t length, char_size;
14041 int share_wstr, share_utf8;
14042 unsigned int kind;
14043 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014044
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014046
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014047 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014050 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014051 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014052 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014054 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014056 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014057 if (self == NULL) {
14058 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 return NULL;
14060 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014061 kind = PyUnicode_KIND(unicode);
14062 length = PyUnicode_GET_LENGTH(unicode);
14063
14064 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014065#ifdef Py_DEBUG
14066 _PyUnicode_HASH(self) = -1;
14067#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014068 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014069#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 _PyUnicode_STATE(self).interned = 0;
14071 _PyUnicode_STATE(self).kind = kind;
14072 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014073 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014074 _PyUnicode_STATE(self).ready = 1;
14075 _PyUnicode_WSTR(self) = NULL;
14076 _PyUnicode_UTF8_LENGTH(self) = 0;
14077 _PyUnicode_UTF8(self) = NULL;
14078 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014079 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014080
14081 share_utf8 = 0;
14082 share_wstr = 0;
14083 if (kind == PyUnicode_1BYTE_KIND) {
14084 char_size = 1;
14085 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14086 share_utf8 = 1;
14087 }
14088 else if (kind == PyUnicode_2BYTE_KIND) {
14089 char_size = 2;
14090 if (sizeof(wchar_t) == 2)
14091 share_wstr = 1;
14092 }
14093 else {
14094 assert(kind == PyUnicode_4BYTE_KIND);
14095 char_size = 4;
14096 if (sizeof(wchar_t) == 4)
14097 share_wstr = 1;
14098 }
14099
14100 /* Ensure we won't overflow the length. */
14101 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14102 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014105 data = PyObject_MALLOC((length + 1) * char_size);
14106 if (data == NULL) {
14107 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014108 goto onError;
14109 }
14110
Victor Stinnerc3c74152011-10-02 20:39:55 +020014111 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014112 if (share_utf8) {
14113 _PyUnicode_UTF8_LENGTH(self) = length;
14114 _PyUnicode_UTF8(self) = data;
14115 }
14116 if (share_wstr) {
14117 _PyUnicode_WSTR_LENGTH(self) = length;
14118 _PyUnicode_WSTR(self) = (wchar_t *)data;
14119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014120
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014121 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014122 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014123 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014124#ifdef Py_DEBUG
14125 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14126#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014127 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014128 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014129
14130onError:
14131 Py_DECREF(unicode);
14132 Py_DECREF(self);
14133 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014134}
14135
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014136PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014137"str(object='') -> str\n\
14138str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014139\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014140Create a new string object from the given object. If encoding or\n\
14141errors is specified, then the object must expose a data buffer\n\
14142that will be decoded using the given encoding and error handler.\n\
14143Otherwise, returns the result of object.__str__() (if defined)\n\
14144or repr(object).\n\
14145encoding defaults to sys.getdefaultencoding().\n\
14146errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014147
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014148static PyObject *unicode_iter(PyObject *seq);
14149
Guido van Rossumd57fd912000-03-10 22:53:23 +000014150PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014151 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 "str", /* tp_name */
14153 sizeof(PyUnicodeObject), /* tp_size */
14154 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014155 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 (destructor)unicode_dealloc, /* tp_dealloc */
14157 0, /* tp_print */
14158 0, /* tp_getattr */
14159 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014160 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014161 unicode_repr, /* tp_repr */
14162 &unicode_as_number, /* tp_as_number */
14163 &unicode_as_sequence, /* tp_as_sequence */
14164 &unicode_as_mapping, /* tp_as_mapping */
14165 (hashfunc) unicode_hash, /* tp_hash*/
14166 0, /* tp_call*/
14167 (reprfunc) unicode_str, /* tp_str */
14168 PyObject_GenericGetAttr, /* tp_getattro */
14169 0, /* tp_setattro */
14170 0, /* tp_as_buffer */
14171 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014172 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 unicode_doc, /* tp_doc */
14174 0, /* tp_traverse */
14175 0, /* tp_clear */
14176 PyUnicode_RichCompare, /* tp_richcompare */
14177 0, /* tp_weaklistoffset */
14178 unicode_iter, /* tp_iter */
14179 0, /* tp_iternext */
14180 unicode_methods, /* tp_methods */
14181 0, /* tp_members */
14182 0, /* tp_getset */
14183 &PyBaseObject_Type, /* tp_base */
14184 0, /* tp_dict */
14185 0, /* tp_descr_get */
14186 0, /* tp_descr_set */
14187 0, /* tp_dictoffset */
14188 0, /* tp_init */
14189 0, /* tp_alloc */
14190 unicode_new, /* tp_new */
14191 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192};
14193
14194/* Initialize the Unicode implementation */
14195
Victor Stinner3a50e702011-10-18 21:21:00 +020014196int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014198 int i;
14199
Thomas Wouters477c8d52006-05-27 19:21:47 +000014200 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014202 0x000A, /* LINE FEED */
14203 0x000D, /* CARRIAGE RETURN */
14204 0x001C, /* FILE SEPARATOR */
14205 0x001D, /* GROUP SEPARATOR */
14206 0x001E, /* RECORD SEPARATOR */
14207 0x0085, /* NEXT LINE */
14208 0x2028, /* LINE SEPARATOR */
14209 0x2029, /* PARAGRAPH SEPARATOR */
14210 };
14211
Fred Drakee4315f52000-05-09 19:53:39 +000014212 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014213 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014214 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014216 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014218 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014220 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014221 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014222
14223 /* initialize the linebreak bloom filter */
14224 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014225 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014226 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014227
14228 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014229
Benjamin Petersonc4311282012-10-30 23:21:10 -040014230 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14231 Py_FatalError("Can't initialize field name iterator type");
14232
14233 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14234 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014235
Victor Stinner3a50e702011-10-18 21:21:00 +020014236#ifdef HAVE_MBCS
14237 winver.dwOSVersionInfoSize = sizeof(winver);
14238 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14239 PyErr_SetFromWindowsErr(0);
14240 return -1;
14241 }
14242#endif
14243 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014244}
14245
14246/* Finalize the Unicode implementation */
14247
/* Clear the unicode free list.  This implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14253
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254void
Thomas Wouters78890102000-07-22 19:25:51 +000014255_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014256{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014257 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014258
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014259 Py_XDECREF(unicode_empty);
14260 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014261
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014262 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014263 if (unicode_latin1[i]) {
14264 Py_DECREF(unicode_latin1[i]);
14265 unicode_latin1[i] = NULL;
14266 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014267 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014268 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014269 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014270}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014271
Walter Dörwald16807132007-05-25 13:52:07 +000014272void
14273PyUnicode_InternInPlace(PyObject **p)
14274{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014275 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014276 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014277#ifdef Py_DEBUG
14278 assert(s != NULL);
14279 assert(_PyUnicode_CHECK(s));
14280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014282 return;
14283#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014284 /* If it's a subclass, we don't really know what putting
14285 it in the interned dict might do. */
14286 if (!PyUnicode_CheckExact(s))
14287 return;
14288 if (PyUnicode_CHECK_INTERNED(s))
14289 return;
14290 if (interned == NULL) {
14291 interned = PyDict_New();
14292 if (interned == NULL) {
14293 PyErr_Clear(); /* Don't leave an exception */
14294 return;
14295 }
14296 }
14297 /* It might be that the GetItem call fails even
14298 though the key is present in the dictionary,
14299 namely when this happens during a stack overflow. */
14300 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014301 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014303
Benjamin Peterson29060642009-01-31 22:14:21 +000014304 if (t) {
14305 Py_INCREF(t);
14306 Py_DECREF(*p);
14307 *p = t;
14308 return;
14309 }
Walter Dörwald16807132007-05-25 13:52:07 +000014310
Benjamin Peterson14339b62009-01-31 16:36:08 +000014311 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014312 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014313 PyErr_Clear();
14314 PyThreadState_GET()->recursion_critical = 0;
14315 return;
14316 }
14317 PyThreadState_GET()->recursion_critical = 0;
14318 /* The two references in interned are not counted by refcnt.
14319 The deallocator will take care of this */
14320 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014322}
14323
14324void
14325PyUnicode_InternImmortal(PyObject **p)
14326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014327 PyUnicode_InternInPlace(p);
14328 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014329 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014330 Py_INCREF(*p);
14331 }
Walter Dörwald16807132007-05-25 13:52:07 +000014332}
14333
14334PyObject *
14335PyUnicode_InternFromString(const char *cp)
14336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014337 PyObject *s = PyUnicode_FromString(cp);
14338 if (s == NULL)
14339 return NULL;
14340 PyUnicode_InternInPlace(&s);
14341 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014342}
14343
Alexander Belopolsky40018472011-02-26 01:02:56 +000014344void
14345_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014347 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014348 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 Py_ssize_t i, n;
14350 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014351
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 if (interned == NULL || !PyDict_Check(interned))
14353 return;
14354 keys = PyDict_Keys(interned);
14355 if (keys == NULL || !PyList_Check(keys)) {
14356 PyErr_Clear();
14357 return;
14358 }
Walter Dörwald16807132007-05-25 13:52:07 +000014359
Benjamin Peterson14339b62009-01-31 16:36:08 +000014360 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14361 detector, interned unicode strings are not forcibly deallocated;
14362 rather, we give them their stolen references back, and then clear
14363 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014364
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 n = PyList_GET_SIZE(keys);
14366 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014367 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014369 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014370 if (PyUnicode_READY(s) == -1) {
14371 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014372 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014374 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014375 case SSTATE_NOT_INTERNED:
14376 /* XXX Shouldn't happen */
14377 break;
14378 case SSTATE_INTERNED_IMMORTAL:
14379 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 break;
14382 case SSTATE_INTERNED_MORTAL:
14383 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014384 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014385 break;
14386 default:
14387 Py_FatalError("Inconsistent interned string state.");
14388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014389 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014390 }
14391 fprintf(stderr, "total size of all interned strings: "
14392 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14393 "mortal/immortal\n", mortal_size, immortal_size);
14394 Py_DECREF(keys);
14395 PyDict_Clear(interned);
14396 Py_DECREF(interned);
14397 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014398}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014399
14400
14401/********************* Unicode Iterator **************************/
14402
14403typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014404 PyObject_HEAD
14405 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014406 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014407} unicodeiterobject;
14408
14409static void
14410unicodeiter_dealloc(unicodeiterobject *it)
14411{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014412 _PyObject_GC_UNTRACK(it);
14413 Py_XDECREF(it->it_seq);
14414 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014415}
14416
14417static int
14418unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14419{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014420 Py_VISIT(it->it_seq);
14421 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014422}
14423
14424static PyObject *
14425unicodeiter_next(unicodeiterobject *it)
14426{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014427 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014428
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 assert(it != NULL);
14430 seq = it->it_seq;
14431 if (seq == NULL)
14432 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014433 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014435 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14436 int kind = PyUnicode_KIND(seq);
14437 void *data = PyUnicode_DATA(seq);
14438 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14439 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014440 if (item != NULL)
14441 ++it->it_index;
14442 return item;
14443 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014444
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 Py_DECREF(seq);
14446 it->it_seq = NULL;
14447 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014448}
14449
14450static PyObject *
14451unicodeiter_len(unicodeiterobject *it)
14452{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014453 Py_ssize_t len = 0;
14454 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014455 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014457}
14458
14459PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14460
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014461static PyObject *
14462unicodeiter_reduce(unicodeiterobject *it)
14463{
14464 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014465 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014466 it->it_seq, it->it_index);
14467 } else {
14468 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14469 if (u == NULL)
14470 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014471 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014472 }
14473}
14474
14475PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14476
14477static PyObject *
14478unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14479{
14480 Py_ssize_t index = PyLong_AsSsize_t(state);
14481 if (index == -1 && PyErr_Occurred())
14482 return NULL;
14483 if (index < 0)
14484 index = 0;
14485 it->it_index = index;
14486 Py_RETURN_NONE;
14487}
14488
14489PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14490
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014491static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014493 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014494 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14495 reduce_doc},
14496 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14497 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014498 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014499};
14500
14501PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014502 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14503 "str_iterator", /* tp_name */
14504 sizeof(unicodeiterobject), /* tp_basicsize */
14505 0, /* tp_itemsize */
14506 /* methods */
14507 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14508 0, /* tp_print */
14509 0, /* tp_getattr */
14510 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014511 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014512 0, /* tp_repr */
14513 0, /* tp_as_number */
14514 0, /* tp_as_sequence */
14515 0, /* tp_as_mapping */
14516 0, /* tp_hash */
14517 0, /* tp_call */
14518 0, /* tp_str */
14519 PyObject_GenericGetAttr, /* tp_getattro */
14520 0, /* tp_setattro */
14521 0, /* tp_as_buffer */
14522 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14523 0, /* tp_doc */
14524 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14525 0, /* tp_clear */
14526 0, /* tp_richcompare */
14527 0, /* tp_weaklistoffset */
14528 PyObject_SelfIter, /* tp_iter */
14529 (iternextfunc)unicodeiter_next, /* tp_iternext */
14530 unicodeiter_methods, /* tp_methods */
14531 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014532};
14533
14534static PyObject *
14535unicode_iter(PyObject *seq)
14536{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014538
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 if (!PyUnicode_Check(seq)) {
14540 PyErr_BadInternalCall();
14541 return NULL;
14542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014543 if (PyUnicode_READY(seq) == -1)
14544 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014545 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14546 if (it == NULL)
14547 return NULL;
14548 it->it_index = 0;
14549 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014550 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014551 _PyObject_GC_TRACK(it);
14552 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014553}
14554
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014555
/* Return the number of Py_UNICODE code units in `u` that precede the
   terminating null code unit.

   The count is kept in a size_t, matching the return type, so that very
   long buffers cannot overflow a narrower signed counter. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = 0;

    while (u[length] != 0)
        length++;
    return length;
}
14564
/* Copy the null-terminated Py_UNICODE string `s2`, terminator included,
   into the buffer `s1`.  No bounds checking is performed.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
14572
/* Copy at most `n` Py_UNICODE code units from `s2` into `s1`.

   Copying stops after the terminating null of `s2` has been stored, or
   after `n` code units have been written, whichever comes first.  When
   `s2` holds `n` or more code units before its terminator, the result is
   NOT null-terminated.  The remainder of `s1` is not zero-padded.

   The bound is checked before every store, so no more than `n` code units
   are ever written and nothing is written when `n` is 0.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    for (; n > 0; n--) {
        if ((*dst++ = *s2++) == 0)
            break;
    }
    return s1;
}
14582
/* Append the null-terminated Py_UNICODE string `s2` to the end of the
   null-terminated string in `s1`, overwriting the old terminator of `s1`
   and storing a new one.  No bounds checking is performed.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of the destination string. */
    while (*tail != 0)
        tail++;
    /* Copy the source, terminator included, starting at that position. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
14591
/* Compare two null-terminated Py_UNICODE strings code unit by code unit.

   Returns 0 when they are equal, -1 when `s1` orders before `s2` and 1
   when it orders after.  A string that is a proper prefix of the other
   orders first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE c1 = *s1;
        const Py_UNICODE c2 = *s2;

        if (c1 == 0)
            return (c2 == 0) ? 0 : -1;   /* s1 ended first, or both ended */
        if (c2 == 0)
            return 1;                    /* s2 ended first */
        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
    }
}
14605
/* Compare at most the first `n` code units of two null-terminated
   Py_UNICODE strings.

   Returns 0 when the compared prefixes are equal (always the case for
   n == 0), -1 when `s1` orders before `s2` and 1 when it orders after.
   Comparison stops at the first difference or at a shared terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- > 0) {
        const Py_UNICODE c1 = *s1++;
        const Py_UNICODE c2 = *s2++;

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
14622
/* Find the first occurrence of code unit `c` in the null-terminated
   Py_UNICODE string `s`.

   Returns a pointer to the match, or NULL when `c` does not occur.  The
   terminator itself is never examined, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14632
/* Find the last occurrence of code unit `c` in the null-terminated
   Py_UNICODE string `s`.

   Returns a pointer to the match, or NULL when `c` does not occur.  The
   terminator itself is never examined, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014645
Victor Stinner71133ff2010-09-01 23:43:53 +000014646Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014647PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014648{
Victor Stinner577db2c2011-10-11 22:12:48 +020014649 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014650 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014652 if (!PyUnicode_Check(unicode)) {
14653 PyErr_BadArgument();
14654 return NULL;
14655 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014656 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014657 if (u == NULL)
14658 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014659 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014660 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014661 PyErr_NoMemory();
14662 return NULL;
14663 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014664 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014665 size *= sizeof(Py_UNICODE);
14666 copy = PyMem_Malloc(size);
14667 if (copy == NULL) {
14668 PyErr_NoMemory();
14669 return NULL;
14670 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014671 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014672 return copy;
14673}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014674
Georg Brandl66c221e2010-10-14 07:04:07 +000014675/* A _string module, to export formatter_parser and formatter_field_name_split
14676 to the string.Formatter class implemented in Python. */
14677
14678static PyMethodDef _string_methods[] = {
14679 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14680 METH_O, PyDoc_STR("split the argument as a field name")},
14681 {"formatter_parser", (PyCFunction) formatter_parser,
14682 METH_O, PyDoc_STR("parse the argument as a format string")},
14683 {NULL, NULL}
14684};
14685
14686static struct PyModuleDef _string_module = {
14687 PyModuleDef_HEAD_INIT,
14688 "_string",
14689 PyDoc_STR("string helper module"),
14690 0,
14691 _string_methods,
14692 NULL,
14693 NULL,
14694 NULL,
14695 NULL
14696};
14697
14698PyMODINIT_FUNC
14699PyInit__string(void)
14700{
14701 return PyModule_Create(&_string_module);
14702}
14703
14704
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014705#ifdef __cplusplus
14706}
14707#endif