blob: cbd2870c1d73ae421af93f0cecbcad5b043d91ab [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest valid code point as of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file: release builds only
   do the type check, debug builds run _PyUnicode_CheckConsistency() with
   content checking disabled (second argument 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* --- Field accessors -----------------------------------------------------
   The macros without an assert() expand to a plain struct member access.
   The others first assert that `op` passes _PyUnicode_CHECK() (and, where
   stated, that it is ready) before yielding the value. */

/* Cached UTF-8 pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string: for a compact ASCII string this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                      \
          ((char*)((PyASCIIObject*)(op) + 1)) :             \
          _PyUnicode_UTF8(op)))

/* Length member that belongs to the cached UTF-8 pointer. */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for a compact ASCII string this is the
   string length itself, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                      \
          ((PyASCIIObject*)(op))->length :                  \
          _PyUnicode_UTF8_LENGTH(op)))

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op)                               \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                 \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a PyUnicodeObject (the data.any member). */
#define _PyUnicode_DATA_ANY(op)                             \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Cheap replacement for Py_MAX() when combining two "maximum character"
   values: the two arguments are simply OR-ed together.  Use it when you are
   computing the second argument of PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2)                     \
    ((maxchar1) | (maxchar2))
111
/* File-local override of PyUnicode_READY(): asserts that `op` passes
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_READY(op) ?                             \
      _PyUnicode_Ready(op) :                                \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the UTF-8 pointer of `op` is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer is involved.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the very same pointer as its
   character data.  The comparison is made on the macro argument itself; it
   must not rely on the caller having a variable called `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not a compact ASCII string, its UTF-8 pointer is set, and that pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op) != NULL                        \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) != NULL                           \
      && (!PyUnicode_IS_READY(op)                           \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it may
   be any pointer expression and is converted to "to_type *" as a whole
   (the argument is parenthesized before the cast, so an expression such as
   "base + offset" keeps its own pointer arithmetic).

   The bulk of the input is copied four characters per iteration; the
   remaining 0-3 characters are copied one by one.  All locals carry a
   leading underscore so they do not shadow names at the call site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
172 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
Serhiy Storchaka678db842013-01-26 12:16:36 +0200180#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200181 do { \
182 if (unicode_empty != NULL) \
183 Py_INCREF(unicode_empty); \
184 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185 unicode_empty = PyUnicode_New(0, 0); \
186 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
189 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200191 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193#define _Py_RETURN_UNICODE_EMPTY() \
194 do { \
195 _Py_INCREF_UNICODE_EMPTY(); \
196 return unicode_empty; \
197 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200199/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202/* Single character Unicode strings in the Latin-1 range are being
203 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise.
   Entries set to 1:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
236
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100240static int unicode_modifiable(PyObject *unicode);
241
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200244_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
247static PyObject *
248_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
249
250static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100253 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100259 PyObject *unicode,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Fast detection of line breaks in the ASCII range: a 128-entry read-only
   table indexed by ASCII code, holding 1 for line break characters and 0
   otherwise.  Entries set to 1:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   The table is only ever read (see BLOOM_LINEBREAK), hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   Every invariant is verified with assert(), so a violation aborts; when
   all checks pass the function returns 1 (which lets callers wrap the call
   itself in an assert()).

   op            -- object to check, must be a unicode object
   check_content -- if non-zero (and the string is not in the legacy wchar_t
                    representation), also scan the characters to verify that
                    the narrowest possible kind is used and that the data is
                    null-terminated */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii && ascii->state.compact) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact) {
            /* compact, non-ASCII: data directly follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* legacy string that only has a wchar_t buffer so far */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        /* wstr has to alias the data exactly when the kind has the same
           width as wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
#if SIZEOF_WCHAR_T == 2
            if (kind == PyUnicode_2BYTE_KIND)
#else
            if (kind == PyUnicode_4BYTE_KIND)
#endif
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an unset cache pointer implies a zero cached length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
420
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100421static PyObject*
422unicode_result_wchar(PyObject *unicode)
423{
424#ifndef Py_DEBUG
425 Py_ssize_t len;
426
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100427 len = _PyUnicode_WSTR_LENGTH(unicode);
428 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 }
432
433 if (len == 1) {
434 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
435 if (ch < 256) {
436 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
437 Py_DECREF(unicode);
438 return latin1_char;
439 }
440 }
441
442 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200443 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100444 return NULL;
445 }
446#else
Victor Stinneraa771272012-10-04 02:32:58 +0200447 assert(Py_REFCNT(unicode) == 1);
448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100449 /* don't make the result ready in debug mode to ensure that the caller
450 makes the string ready before using it */
451 assert(_PyUnicode_CheckConsistency(unicode, 1));
452#endif
453 return unicode;
454}
455
456static PyObject*
457unicode_result_ready(PyObject *unicode)
458{
459 Py_ssize_t length;
460
461 length = PyUnicode_GET_LENGTH(unicode);
462 if (length == 0) {
463 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100464 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200465 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 }
467 return unicode_empty;
468 }
469
470 if (length == 1) {
471 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
472 if (ch < 256) {
473 PyObject *latin1_char = unicode_latin1[ch];
474 if (latin1_char != NULL) {
475 if (unicode != latin1_char) {
476 Py_INCREF(latin1_char);
477 Py_DECREF(unicode);
478 }
479 return latin1_char;
480 }
481 else {
482 assert(_PyUnicode_CheckConsistency(unicode, 1));
483 Py_INCREF(unicode);
484 unicode_latin1[ch] = unicode;
485 return unicode;
486 }
487 }
488 }
489
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 return unicode;
492}
493
494static PyObject*
495unicode_result(PyObject *unicode)
496{
497 assert(_PyUnicode_CHECK(unicode));
498 if (PyUnicode_IS_READY(unicode))
499 return unicode_result_ready(unicode);
500 else
501 return unicode_result_wchar(unicode);
502}
503
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504static PyObject*
505unicode_result_unchanged(PyObject *unicode)
506{
507 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500508 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100509 return NULL;
510 Py_INCREF(unicode);
511 return unicode;
512 }
513 else
514 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100515 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100516}
517
/* Windows version information record; only compiled in when HAVE_MBCS is
   defined.  NOTE(review): it is neither filled in nor read within this part
   of the file. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
521
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522/* --- Bloom Filters ----------------------------------------------------- */
523
524/* stuff to implement simple "bloom filters" for Unicode characters.
525 to keep things simple, we use a single bitmask, using the least 5
526 bits from each unicode characters as the bit index. */
527
528/* the linebreak mask is set up by Unicode_Init below */
529
Antoine Pitrouf068f942010-01-13 14:19:12 +0000530#if LONG_BIT >= 128
531#define BLOOM_WIDTH 128
532#elif LONG_BIT >= 64
533#define BLOOM_WIDTH 64
534#elif LONG_BIT >= 32
535#define BLOOM_WIDTH 32
536#else
537#error "LONG_BIT is smaller than 32"
538#endif
539
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540#define BLOOM_MASK unsigned long
541
Serhiy Storchaka05997252013-01-26 12:14:02 +0200542static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
Antoine Pitrouf068f942010-01-13 14:19:12 +0000544#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
545#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
Benjamin Peterson29060642009-01-31 22:14:21 +0000547#define BLOOM_LINEBREAK(ch) \
548 ((ch) < 128U ? ascii_linebreak[(ch)] : \
549 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Alexander Belopolsky40018472011-02-26 01:02:56 +0000551Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553{
554 /* calculate simple bloom-style bitmask for a given unicode string */
555
Antoine Pitrouf068f942010-01-13 14:19:12 +0000556 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557 Py_ssize_t i;
558
559 mask = 0;
560 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000562
563 return mask;
564}
565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566#define BLOOM_MEMBER(mask, chr, str) \
567 (BLOOM(mask, chr) \
568 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200570/* Compilation of templated routines */
571
572#include "stringlib/asciilib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs1lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs2lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs4lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
608#include "stringlib/find_max_char.h"
609#include "stringlib/localeutil.h"
610#include "stringlib/undef.h"
611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612#include "stringlib/unicodedefs.h"
613#include "stringlib/fastsearch.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100616#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618/* --- Unicode Object ----------------------------------------------------- */
619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200621fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200623Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
624 Py_ssize_t size, Py_UCS4 ch,
625 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200627 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
628
629 switch (kind) {
630 case PyUnicode_1BYTE_KIND:
631 {
632 Py_UCS1 ch1 = (Py_UCS1) ch;
633 if (ch1 == ch)
634 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
635 else
636 return -1;
637 }
638 case PyUnicode_2BYTE_KIND:
639 {
640 Py_UCS2 ch2 = (Py_UCS2) ch;
641 if (ch2 == ch)
642 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_4BYTE_KIND:
647 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
648 default:
649 assert(0);
650 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200652}
653
#ifdef Py_DEBUG
/* Fill the newly added part of a Unicode string (the characters from
   old_length up to the current length) with 0xff bytes, i.e. with invalid
   characters, to detect bugs earlier.  Nothing is written if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
672
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673static PyObject*
674resize_compact(PyObject *unicode, Py_ssize_t length)
675{
676 Py_ssize_t char_size;
677 Py_ssize_t struct_size;
678 Py_ssize_t new_size;
679 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100680 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200681#ifdef Py_DEBUG
682 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
683#endif
684
Victor Stinner79891572012-05-03 13:43:07 +0200685 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100687 assert(PyUnicode_IS_COMPACT(unicode));
688
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200689 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100690 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 struct_size = sizeof(PyASCIIObject);
692 else
693 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200694 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
697 PyErr_NoMemory();
698 return NULL;
699 }
700 new_size = (struct_size + (length + 1) * char_size);
701
Victor Stinner84def372011-12-11 20:04:56 +0100702 _Py_DEC_REFTOTAL;
703 _Py_ForgetReference(unicode);
704
705 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
706 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100707 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 PyErr_NoMemory();
709 return NULL;
710 }
Victor Stinner84def372011-12-11 20:04:56 +0100711 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100713
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200715 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100717 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_WSTR_LENGTH(unicode) = length;
719 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200720#ifdef Py_DEBUG
721 unicode_fill_invalid(unicode, old_length);
722#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
724 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200725 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 return unicode;
727}
728
Alexander Belopolsky40018472011-02-26 01:02:56 +0000729static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200730resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731{
Victor Stinner95663112011-10-04 01:03:50 +0200732 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100733 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000736
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 if (PyUnicode_IS_READY(unicode)) {
738 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200739 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200741#ifdef Py_DEBUG
742 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
743#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744
745 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200746 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200747 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
748 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749
750 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
751 PyErr_NoMemory();
752 return -1;
753 }
754 new_size = (length + 1) * char_size;
755
Victor Stinner7a9105a2011-12-12 00:13:42 +0100756 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
757 {
758 PyObject_DEL(_PyUnicode_UTF8(unicode));
759 _PyUnicode_UTF8(unicode) = NULL;
760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
761 }
762
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 data = (PyObject *)PyObject_REALLOC(data, new_size);
764 if (data == NULL) {
765 PyErr_NoMemory();
766 return -1;
767 }
768 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200769 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200771 _PyUnicode_WSTR_LENGTH(unicode) = length;
772 }
773 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200774 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200775 _PyUnicode_UTF8_LENGTH(unicode) = length;
776 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200777 _PyUnicode_LENGTH(unicode) = length;
778 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200779#ifdef Py_DEBUG
780 unicode_fill_invalid(unicode, old_length);
781#endif
Victor Stinner95663112011-10-04 01:03:50 +0200782 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200783 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200784 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200785 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200786 }
Victor Stinner95663112011-10-04 01:03:50 +0200787 assert(_PyUnicode_WSTR(unicode) != NULL);
788
789 /* check for integer overflow */
790 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
791 PyErr_NoMemory();
792 return -1;
793 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100794 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200795 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100796 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200797 if (!wstr) {
798 PyErr_NoMemory();
799 return -1;
800 }
801 _PyUnicode_WSTR(unicode) = wstr;
802 _PyUnicode_WSTR(unicode)[length] = 0;
803 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200804 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805 return 0;
806}
807
Victor Stinnerfe226c02011-10-03 03:52:20 +0200808static PyObject*
809resize_copy(PyObject *unicode, Py_ssize_t length)
810{
811 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100812 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100814
Benjamin Petersonbac79492012-01-14 13:34:47 -0500815 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100816 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817
818 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
819 if (copy == NULL)
820 return NULL;
821
822 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200823 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200824 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200825 }
826 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200827 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200829 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200830 if (w == NULL)
831 return NULL;
832 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
833 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200834 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
835 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200836 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837 }
838}
839
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000841 Ux0000 terminated; some code (e.g. new_identifier)
842 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
844 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000845 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847*/
848
Alexander Belopolsky40018472011-02-26 01:02:56 +0000849static PyUnicodeObject *
850_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851{
852 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856 if (length == 0 && unicode_empty != NULL) {
857 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200858 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000861 /* Ensure we won't overflow the size. */
862 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
863 return (PyUnicodeObject *)PyErr_NoMemory();
864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (length < 0) {
866 PyErr_SetString(PyExc_SystemError,
867 "Negative size passed to _PyUnicode_New");
868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869 }
870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200871 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
872 if (unicode == NULL)
873 return NULL;
874 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
875 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
876 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100877 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000878 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100879 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881
Jeremy Hyltond8082792003-09-16 19:41:39 +0000882 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000883 * the caller fails before initializing str -- unicode_resize()
884 * reads str[0], and the Keep-Alive optimization can keep memory
885 * allocated for str alive across a call to unicode_dealloc(unicode).
886 * We don't want unicode_resize to read uninitialized memory in
887 * that case.
888 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 _PyUnicode_WSTR(unicode)[0] = 0;
890 _PyUnicode_WSTR(unicode)[length] = 0;
891 _PyUnicode_WSTR_LENGTH(unicode) = length;
892 _PyUnicode_HASH(unicode) = -1;
893 _PyUnicode_STATE(unicode).interned = 0;
894 _PyUnicode_STATE(unicode).kind = 0;
895 _PyUnicode_STATE(unicode).compact = 0;
896 _PyUnicode_STATE(unicode).ready = 0;
897 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200898 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200900 _PyUnicode_UTF8(unicode) = NULL;
901 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100902 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 return unicode;
904}
905
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906static const char*
907unicode_kind_name(PyObject *unicode)
908{
Victor Stinner42dfd712011-10-03 14:41:45 +0200909 /* don't check consistency: unicode_kind_name() is called from
910 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200911 if (!PyUnicode_IS_COMPACT(unicode))
912 {
913 if (!PyUnicode_IS_READY(unicode))
914 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600915 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200916 {
917 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200918 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200919 return "legacy ascii";
920 else
921 return "legacy latin1";
922 case PyUnicode_2BYTE_KIND:
923 return "legacy UCS2";
924 case PyUnicode_4BYTE_KIND:
925 return "legacy UCS4";
926 default:
927 return "<legacy invalid kind>";
928 }
929 }
930 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600931 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200932 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200933 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200934 return "ascii";
935 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200936 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200937 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200938 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200939 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200940 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 default:
942 return "<invalid compact kind>";
943 }
944}
945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
951
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
955void *_PyUnicode_data(void *unicode){
956 printf("obj %p\n", unicode);
957 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
958 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
959 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
960 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
961 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
962 return PyUnicode_DATA(unicode);
963}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200964
965void
966_PyUnicode_Dump(PyObject *op)
967{
968 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
970 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
971 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200972
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200974 {
975 if (ascii->state.ascii)
976 data = (ascii + 1);
977 else
978 data = (compact + 1);
979 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200980 else
981 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200982 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
983
Victor Stinnera849a4b2011-10-03 12:12:11 +0200984 if (ascii->wstr == data)
985 printf("shared ");
986 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200987
Victor Stinnera3b334d2011-10-03 13:53:37 +0200988 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200989 printf(" (%zu), ", compact->wstr_length);
990 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
991 printf("shared ");
992 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200993 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200994 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200995}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996#endif
997
998PyObject *
999PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1000{
1001 PyObject *obj;
1002 PyCompactUnicodeObject *unicode;
1003 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001004 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001005 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 Py_ssize_t char_size;
1007 Py_ssize_t struct_size;
1008
1009 /* Optimization for empty strings */
1010 if (size == 0 && unicode_empty != NULL) {
1011 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001012 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
1014
Victor Stinner9e9d6892011-10-04 01:02:02 +02001015 is_ascii = 0;
1016 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 struct_size = sizeof(PyCompactUnicodeObject);
1018 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001019 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020 char_size = 1;
1021 is_ascii = 1;
1022 struct_size = sizeof(PyASCIIObject);
1023 }
1024 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001025 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001026 char_size = 1;
1027 }
1028 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 2;
1031 if (sizeof(wchar_t) == 2)
1032 is_sharing = 1;
1033 }
1034 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001035 if (maxchar > MAX_UNICODE) {
1036 PyErr_SetString(PyExc_SystemError,
1037 "invalid maximum character passed to PyUnicode_New");
1038 return NULL;
1039 }
Victor Stinner8f825062012-04-27 13:55:39 +02001040 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 char_size = 4;
1042 if (sizeof(wchar_t) == 4)
1043 is_sharing = 1;
1044 }
1045
1046 /* Ensure we won't overflow the size. */
1047 if (size < 0) {
1048 PyErr_SetString(PyExc_SystemError,
1049 "Negative size passed to PyUnicode_New");
1050 return NULL;
1051 }
1052 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1053 return PyErr_NoMemory();
1054
1055 /* Duplicated allocation code from _PyObject_New() instead of a call to
1056 * PyObject_New() so we are able to allocate space for the object and
1057 * it's data buffer.
1058 */
1059 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1060 if (obj == NULL)
1061 return PyErr_NoMemory();
1062 obj = PyObject_INIT(obj, &PyUnicode_Type);
1063 if (obj == NULL)
1064 return NULL;
1065
1066 unicode = (PyCompactUnicodeObject *)obj;
1067 if (is_ascii)
1068 data = ((PyASCIIObject*)obj) + 1;
1069 else
1070 data = unicode + 1;
1071 _PyUnicode_LENGTH(unicode) = size;
1072 _PyUnicode_HASH(unicode) = -1;
1073 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001074 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 _PyUnicode_STATE(unicode).compact = 1;
1076 _PyUnicode_STATE(unicode).ready = 1;
1077 _PyUnicode_STATE(unicode).ascii = is_ascii;
1078 if (is_ascii) {
1079 ((char*)data)[size] = 0;
1080 _PyUnicode_WSTR(unicode) = NULL;
1081 }
Victor Stinner8f825062012-04-27 13:55:39 +02001082 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083 ((char*)data)[size] = 0;
1084 _PyUnicode_WSTR(unicode) = NULL;
1085 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001087 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 else {
1090 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001091 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001092 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001094 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 ((Py_UCS4*)data)[size] = 0;
1096 if (is_sharing) {
1097 _PyUnicode_WSTR_LENGTH(unicode) = size;
1098 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1099 }
1100 else {
1101 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1102 _PyUnicode_WSTR(unicode) = NULL;
1103 }
1104 }
Victor Stinner8f825062012-04-27 13:55:39 +02001105#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001106 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001107#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001108 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 return obj;
1110}
1111
1112#if SIZEOF_WCHAR_T == 2
1113/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1114 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001115 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116
1117 This function assumes that unicode can hold one more code point than wstr
1118 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001119static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122{
1123 const wchar_t *iter;
1124 Py_UCS4 *ucs4_out;
1125
Victor Stinner910337b2011-10-03 03:20:16 +02001126 assert(unicode != NULL);
1127 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1129 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1130
1131 for (iter = begin; iter < end; ) {
1132 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1133 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001134 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1135 && (iter+1) < end
1136 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 {
Victor Stinner551ac952011-11-29 22:58:13 +01001138 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 iter += 2;
1140 }
1141 else {
1142 *ucs4_out++ = *iter;
1143 iter++;
1144 }
1145 }
1146 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1147 _PyUnicode_GET_LENGTH(unicode)));
1148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149}
1150#endif
1151
Victor Stinnercd9950f2011-10-02 00:34:53 +02001152static int
Victor Stinner488fa492011-12-12 00:01:39 +01001153unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001154{
Victor Stinner488fa492011-12-12 00:01:39 +01001155 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001156 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001157 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001158 return -1;
1159 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001160 return 0;
1161}
1162
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163static int
1164_copy_characters(PyObject *to, Py_ssize_t to_start,
1165 PyObject *from, Py_ssize_t from_start,
1166 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001168 unsigned int from_kind, to_kind;
1169 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinneree4544c2012-05-09 22:24:08 +02001171 assert(0 <= how_many);
1172 assert(0 <= from_start);
1173 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001174 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001175 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001176 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001177
Victor Stinnerd3f08822012-05-29 12:57:52 +02001178 assert(PyUnicode_Check(to));
1179 assert(PyUnicode_IS_READY(to));
1180 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1181
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001182 if (how_many == 0)
1183 return 0;
1184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001186 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001187 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001188 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189
Victor Stinnerf1852262012-06-16 16:38:26 +02001190#ifdef Py_DEBUG
1191 if (!check_maxchar
1192 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1193 {
1194 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1195 Py_UCS4 ch;
1196 Py_ssize_t i;
1197 for (i=0; i < how_many; i++) {
1198 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1199 assert(ch <= to_maxchar);
1200 }
1201 }
1202#endif
1203
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001204 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001205 if (check_maxchar
1206 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1207 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001208 /* Writing Latin-1 characters into an ASCII string requires to
1209 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001210 Py_UCS4 max_char;
1211 max_char = ucs1lib_find_max_char(from_data,
1212 (Py_UCS1*)from_data + how_many);
1213 if (max_char >= 128)
1214 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001215 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001216 Py_MEMCPY((char*)to_data + to_kind * to_start,
1217 (char*)from_data + from_kind * from_start,
1218 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 else if (from_kind == PyUnicode_1BYTE_KIND
1221 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001222 {
1223 _PyUnicode_CONVERT_BYTES(
1224 Py_UCS1, Py_UCS2,
1225 PyUnicode_1BYTE_DATA(from) + from_start,
1226 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1227 PyUnicode_2BYTE_DATA(to) + to_start
1228 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001229 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001230 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001231 && to_kind == PyUnicode_4BYTE_KIND)
1232 {
1233 _PyUnicode_CONVERT_BYTES(
1234 Py_UCS1, Py_UCS4,
1235 PyUnicode_1BYTE_DATA(from) + from_start,
1236 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1237 PyUnicode_4BYTE_DATA(to) + to_start
1238 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001239 }
1240 else if (from_kind == PyUnicode_2BYTE_KIND
1241 && to_kind == PyUnicode_4BYTE_KIND)
1242 {
1243 _PyUnicode_CONVERT_BYTES(
1244 Py_UCS2, Py_UCS4,
1245 PyUnicode_2BYTE_DATA(from) + from_start,
1246 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1247 PyUnicode_4BYTE_DATA(to) + to_start
1248 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001249 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001250 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001251 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1252
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001253 if (!check_maxchar) {
1254 if (from_kind == PyUnicode_2BYTE_KIND
1255 && to_kind == PyUnicode_1BYTE_KIND)
1256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS2, Py_UCS1,
1259 PyUnicode_2BYTE_DATA(from) + from_start,
1260 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_1BYTE_DATA(to) + to_start
1262 );
1263 }
1264 else if (from_kind == PyUnicode_4BYTE_KIND
1265 && to_kind == PyUnicode_1BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS4, Py_UCS1,
1269 PyUnicode_4BYTE_DATA(from) + from_start,
1270 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_1BYTE_DATA(to) + to_start
1272 );
1273 }
1274 else if (from_kind == PyUnicode_4BYTE_KIND
1275 && to_kind == PyUnicode_2BYTE_KIND)
1276 {
1277 _PyUnicode_CONVERT_BYTES(
1278 Py_UCS4, Py_UCS2,
1279 PyUnicode_4BYTE_DATA(from) + from_start,
1280 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1281 PyUnicode_2BYTE_DATA(to) + to_start
1282 );
1283 }
1284 else {
1285 assert(0);
1286 return -1;
1287 }
1288 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001289 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001290 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001292 Py_ssize_t i;
1293
Victor Stinnera0702ab2011-09-29 14:14:38 +02001294 for (i=0; i < how_many; i++) {
1295 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001296 if (ch > to_maxchar)
1297 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001298 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1299 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001300 }
1301 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001302 return 0;
1303}
1304
Victor Stinnerd3f08822012-05-29 12:57:52 +02001305void
1306_PyUnicode_FastCopyCharacters(
1307 PyObject *to, Py_ssize_t to_start,
1308 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309{
1310 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1311}
1312
1313Py_ssize_t
1314PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1315 PyObject *from, Py_ssize_t from_start,
1316 Py_ssize_t how_many)
1317{
1318 int err;
1319
1320 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1321 PyErr_BadInternalCall();
1322 return -1;
1323 }
1324
Benjamin Petersonbac79492012-01-14 13:34:47 -05001325 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001326 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001327 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001328 return -1;
1329
Victor Stinnerd3f08822012-05-29 12:57:52 +02001330 if (from_start < 0) {
1331 PyErr_SetString(PyExc_IndexError, "string index out of range");
1332 return -1;
1333 }
1334 if (to_start < 0) {
1335 PyErr_SetString(PyExc_IndexError, "string index out of range");
1336 return -1;
1337 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001338 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1339 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1340 PyErr_Format(PyExc_SystemError,
1341 "Cannot write %zi characters at %zi "
1342 "in a string of %zi characters",
1343 how_many, to_start, PyUnicode_GET_LENGTH(to));
1344 return -1;
1345 }
1346
1347 if (how_many == 0)
1348 return 0;
1349
Victor Stinner488fa492011-12-12 00:01:39 +01001350 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001351 return -1;
1352
1353 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1354 if (err) {
1355 PyErr_Format(PyExc_SystemError,
1356 "Cannot copy %s characters "
1357 "into a string of %s characters",
1358 unicode_kind_name(from),
1359 unicode_kind_name(to));
1360 return -1;
1361 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001362 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363}
1364
Victor Stinner17222162011-09-28 22:15:37 +02001365/* Find the maximum code point and count the number of surrogate pairs so a
1366 correct string length can be computed before converting a string to UCS4.
1367 This function counts single surrogates as a character and not as a pair.
1368
1369 Return 0 on success, or -1 on error. */
1370static int
1371find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1372 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373{
1374 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001375 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376
Victor Stinnerc53be962011-10-02 21:33:54 +02001377 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 *num_surrogates = 0;
1379 *maxchar = 0;
1380
1381 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001383 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1384 && (iter+1) < end
1385 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001387 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 iter += 2;
1390 }
1391 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001393 {
1394 ch = *iter;
1395 iter++;
1396 }
1397 if (ch > *maxchar) {
1398 *maxchar = ch;
1399 if (*maxchar > MAX_UNICODE) {
1400 PyErr_Format(PyExc_ValueError,
1401 "character U+%x is not in range [U+0000; U+10ffff]",
1402 ch);
1403 return -1;
1404 }
1405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 }
1407 return 0;
1408}
1409
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001410int
1411_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412{
1413 wchar_t *end;
1414 Py_UCS4 maxchar = 0;
1415 Py_ssize_t num_surrogates;
1416#if SIZEOF_WCHAR_T == 2
1417 Py_ssize_t length_wo_surrogates;
1418#endif
1419
Georg Brandl7597add2011-10-05 16:36:47 +02001420 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001421 strings were created using _PyObject_New() and where no canonical
1422 representation (the str field) has been set yet aka strings
1423 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001424 assert(_PyUnicode_CHECK(unicode));
1425 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001428 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001429 /* Actually, it should neither be interned nor be anything else: */
1430 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001433 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001434 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436
1437 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001438 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1439 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 PyErr_NoMemory();
1441 return -1;
1442 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001443 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 _PyUnicode_WSTR(unicode), end,
1445 PyUnicode_1BYTE_DATA(unicode));
1446 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1447 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1448 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1449 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001450 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 }
1454 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001455 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 }
1459 PyObject_FREE(_PyUnicode_WSTR(unicode));
1460 _PyUnicode_WSTR(unicode) = NULL;
1461 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1462 }
1463 /* In this case we might have to convert down from 4-byte native
1464 wchar_t to 2-byte unicode. */
1465 else if (maxchar < 65536) {
1466 assert(num_surrogates == 0 &&
1467 "FindMaxCharAndNumSurrogatePairs() messed up");
1468
Victor Stinner506f5922011-09-28 22:34:18 +02001469#if SIZEOF_WCHAR_T == 2
1470 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001471 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001472 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001477#else
1478 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001479 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001480 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001481 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001482 PyErr_NoMemory();
1483 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 }
Victor Stinner506f5922011-09-28 22:34:18 +02001485 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1486 _PyUnicode_WSTR(unicode), end,
1487 PyUnicode_2BYTE_DATA(unicode));
1488 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1489 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1490 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8(unicode) = NULL;
1492 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001493 PyObject_FREE(_PyUnicode_WSTR(unicode));
1494 _PyUnicode_WSTR(unicode) = NULL;
1495 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 }
1498 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1499 else {
1500#if SIZEOF_WCHAR_T == 2
1501 /* in case the native representation is 2-bytes, we need to allocate a
1502 new normalized 4-byte version. */
1503 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001504 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1505 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 PyErr_NoMemory();
1507 return -1;
1508 }
1509 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1510 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001511 _PyUnicode_UTF8(unicode) = NULL;
1512 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001513 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1514 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001515 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 PyObject_FREE(_PyUnicode_WSTR(unicode));
1517 _PyUnicode_WSTR(unicode) = NULL;
1518 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1519#else
1520 assert(num_surrogates == 0);
1521
Victor Stinnerc3c74152011-10-02 20:39:55 +02001522 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001524 _PyUnicode_UTF8(unicode) = NULL;
1525 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1527#endif
1528 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1529 }
1530 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001531 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 return 0;
1533}
1534
Alexander Belopolsky40018472011-02-26 01:02:56 +00001535static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001536unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537{
Walter Dörwald16807132007-05-25 13:52:07 +00001538 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 case SSTATE_NOT_INTERNED:
1540 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001541
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 case SSTATE_INTERNED_MORTAL:
1543 /* revive dead object temporarily for DelItem */
1544 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001545 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 Py_FatalError(
1547 "deletion of interned string failed");
1548 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001549
Benjamin Peterson29060642009-01-31 22:14:21 +00001550 case SSTATE_INTERNED_IMMORTAL:
1551 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001552
Benjamin Peterson29060642009-01-31 22:14:21 +00001553 default:
1554 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001555 }
1556
Victor Stinner03490912011-10-03 23:45:12 +02001557 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001559 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001560 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001561 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1562 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001564 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565}
1566
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character latin1 string held in
   unicode_latin1[]), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready, single-character string can be a latin1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1583
Alexander Belopolsky40018472011-02-26 01:02:56 +00001584static int
Victor Stinner488fa492011-12-12 00:01:39 +01001585unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586{
Victor Stinner488fa492011-12-12 00:01:39 +01001587 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 if (Py_REFCNT(unicode) != 1)
1589 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001590 if (_PyUnicode_HASH(unicode) != -1)
1591 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 if (PyUnicode_CHECK_INTERNED(unicode))
1593 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001594 if (!PyUnicode_CheckExact(unicode))
1595 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001596#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001597 /* singleton refcount is greater than 1 */
1598 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001599#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001600 return 1;
1601}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001602
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603static int
1604unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1605{
1606 PyObject *unicode;
1607 Py_ssize_t old_length;
1608
1609 assert(p_unicode != NULL);
1610 unicode = *p_unicode;
1611
1612 assert(unicode != NULL);
1613 assert(PyUnicode_Check(unicode));
1614 assert(0 <= length);
1615
Victor Stinner910337b2011-10-03 03:20:16 +02001616 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 old_length = PyUnicode_WSTR_LENGTH(unicode);
1618 else
1619 old_length = PyUnicode_GET_LENGTH(unicode);
1620 if (old_length == length)
1621 return 0;
1622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001623 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001624 _Py_INCREF_UNICODE_EMPTY();
1625 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001626 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001627 Py_DECREF(*p_unicode);
1628 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001629 return 0;
1630 }
1631
Victor Stinner488fa492011-12-12 00:01:39 +01001632 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *copy = resize_copy(unicode, length);
1634 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001635 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636 Py_DECREF(*p_unicode);
1637 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001638 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001639 }
1640
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001642 PyObject *new_unicode = resize_compact(unicode, length);
1643 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001645 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001646 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001647 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001648 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001649}
1650
Alexander Belopolsky40018472011-02-26 01:02:56 +00001651int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001652PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001653{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001654 PyObject *unicode;
1655 if (p_unicode == NULL) {
1656 PyErr_BadInternalCall();
1657 return -1;
1658 }
1659 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001660 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001661 {
1662 PyErr_BadInternalCall();
1663 return -1;
1664 }
1665 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001666}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001667
Victor Stinnerc5166102012-02-22 13:55:02 +01001668/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001669
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001670 WARNING: The function doesn't copy the terminating null character and
1671 doesn't check the maximum character (may write a latin1 character in an
1672 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001673static void
1674unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1675 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001676{
1677 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1678 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001679 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
1681 switch (kind) {
1682 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001683 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001684#ifdef Py_DEBUG
1685 if (PyUnicode_IS_ASCII(unicode)) {
1686 Py_UCS4 maxchar = ucs1lib_find_max_char(
1687 (const Py_UCS1*)str,
1688 (const Py_UCS1*)str + len);
1689 assert(maxchar < 128);
1690 }
1691#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001692 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001693 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 }
1695 case PyUnicode_2BYTE_KIND: {
1696 Py_UCS2 *start = (Py_UCS2 *)data + index;
1697 Py_UCS2 *ucs2 = start;
1698 assert(index <= PyUnicode_GET_LENGTH(unicode));
1699
Victor Stinner184252a2012-06-16 02:57:41 +02001700 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001701 *ucs2 = (Py_UCS2)*str;
1702
1703 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001704 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 }
1706 default: {
1707 Py_UCS4 *start = (Py_UCS4 *)data + index;
1708 Py_UCS4 *ucs4 = start;
1709 assert(kind == PyUnicode_4BYTE_KIND);
1710 assert(index <= PyUnicode_GET_LENGTH(unicode));
1711
Victor Stinner184252a2012-06-16 02:57:41 +02001712 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 *ucs4 = (Py_UCS4)*str;
1714
1715 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 }
1717 }
1718}
1719
1720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721static PyObject*
1722get_latin1_char(unsigned char ch)
1723{
Victor Stinnera464fc12011-10-02 20:39:30 +02001724 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001726 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 if (!unicode)
1728 return NULL;
1729 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001730 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 unicode_latin1[ch] = unicode;
1732 }
1733 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001734 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735}
1736
Alexander Belopolsky40018472011-02-26 01:02:56 +00001737PyObject *
1738PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001740 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 Py_UCS4 maxchar = 0;
1742 Py_ssize_t num_surrogates;
1743
1744 if (u == NULL)
1745 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001747 /* If the Unicode data is known at construction time, we can apply
1748 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001751 if (size == 0)
1752 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 /* Single character Unicode objects in the Latin-1 range are
1755 shared when using this constructor */
1756 if (size == 1 && *u < 256)
1757 return get_latin1_char((unsigned char)*u);
1758
1759 /* If not empty and not single character, copy the Unicode data
1760 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001761 if (find_maxchar_surrogates(u, u + size,
1762 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 return NULL;
1764
Victor Stinner8faf8212011-12-08 22:14:11 +01001765 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 if (!unicode)
1767 return NULL;
1768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 switch (PyUnicode_KIND(unicode)) {
1770 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001771 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1773 break;
1774 case PyUnicode_2BYTE_KIND:
1775#if Py_UNICODE_SIZE == 2
1776 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1777#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001778 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1780#endif
1781 break;
1782 case PyUnicode_4BYTE_KIND:
1783#if SIZEOF_WCHAR_T == 2
1784 /* This is the only case which has to process surrogates, thus
1785 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001786 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787#else
1788 assert(num_surrogates == 0);
1789 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1790#endif
1791 break;
1792 default:
1793 assert(0 && "Impossible state");
1794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001796 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797}
1798
Alexander Belopolsky40018472011-02-26 01:02:56 +00001799PyObject *
1800PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001802 if (size < 0) {
1803 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 return NULL;
1806 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001807 if (u != NULL)
1808 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1809 else
1810 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001811}
1812
Alexander Belopolsky40018472011-02-26 01:02:56 +00001813PyObject *
1814PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001815{
1816 size_t size = strlen(u);
1817 if (size > PY_SSIZE_T_MAX) {
1818 PyErr_SetString(PyExc_OverflowError, "input too long");
1819 return NULL;
1820 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001821 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001822}
1823
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001824PyObject *
1825_PyUnicode_FromId(_Py_Identifier *id)
1826{
1827 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001828 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1829 strlen(id->string),
1830 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001831 if (!id->object)
1832 return NULL;
1833 PyUnicode_InternInPlace(&id->object);
1834 assert(!id->next);
1835 id->next = static_strings;
1836 static_strings = id;
1837 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001838 return id->object;
1839}
1840
1841void
1842_PyUnicode_ClearStaticStrings()
1843{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001844 _Py_Identifier *tmp, *s = static_strings;
1845 while (s) {
1846 Py_DECREF(s->object);
1847 s->object = NULL;
1848 tmp = s->next;
1849 s->next = NULL;
1850 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001852 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001853}
1854
Benjamin Peterson0df54292012-03-26 14:50:32 -04001855/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001856
Victor Stinnerd3f08822012-05-29 12:57:52 +02001857PyObject*
1858_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001859{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001860 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001861 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001862 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001863#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001864 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001865#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001866 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001867 }
Victor Stinner785938e2011-12-11 20:09:03 +01001868 unicode = PyUnicode_New(size, 127);
1869 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001870 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001871 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1872 assert(_PyUnicode_CheckConsistency(unicode, 1));
1873 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001874}
1875
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001876static Py_UCS4
1877kind_maxchar_limit(unsigned int kind)
1878{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001879 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001880 case PyUnicode_1BYTE_KIND:
1881 return 0x80;
1882 case PyUnicode_2BYTE_KIND:
1883 return 0x100;
1884 case PyUnicode_4BYTE_KIND:
1885 return 0x10000;
1886 default:
1887 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001888 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001889 }
1890}
1891
Victor Stinnere6abb482012-05-02 01:15:40 +02001892Py_LOCAL_INLINE(Py_UCS4)
1893align_maxchar(Py_UCS4 maxchar)
1894{
1895 if (maxchar <= 127)
1896 return 127;
1897 else if (maxchar <= 255)
1898 return 255;
1899 else if (maxchar <= 65535)
1900 return 65535;
1901 else
1902 return MAX_UNICODE;
1903}
1904
Victor Stinner702c7342011-10-05 13:50:52 +02001905static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001909 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910
Serhiy Storchaka678db842013-01-26 12:16:36 +02001911 if (size == 0)
1912 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001913 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001914 if (size == 1)
1915 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001917 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001918 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 if (!res)
1920 return NULL;
1921 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001922 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001924}
1925
Victor Stinnere57b1c02011-09-28 22:20:48 +02001926static PyObject*
1927_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928{
1929 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001930 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001931
Serhiy Storchaka678db842013-01-26 12:16:36 +02001932 if (size == 0)
1933 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001934 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001935 if (size == 1) {
1936 Py_UCS4 ch = u[0];
1937 if (ch < 256)
1938 return get_latin1_char((unsigned char)ch);
1939
1940 res = PyUnicode_New(1, ch);
1941 if (res == NULL)
1942 return NULL;
1943 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1944 assert(_PyUnicode_CheckConsistency(res, 1));
1945 return res;
1946 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001947
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 if (!res)
1951 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001952 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001954 else {
1955 _PyUnicode_CONVERT_BYTES(
1956 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1957 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001958 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 return res;
1960}
1961
Victor Stinnere57b1c02011-09-28 22:20:48 +02001962static PyObject*
1963_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964{
1965 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001966 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001967
Serhiy Storchaka678db842013-01-26 12:16:36 +02001968 if (size == 0)
1969 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001970 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001971 if (size == 1) {
1972 Py_UCS4 ch = u[0];
1973 if (ch < 256)
1974 return get_latin1_char((unsigned char)ch);
1975
1976 res = PyUnicode_New(1, ch);
1977 if (res == NULL)
1978 return NULL;
1979 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1980 assert(_PyUnicode_CheckConsistency(res, 1));
1981 return res;
1982 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001983
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001984 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001985 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (!res)
1987 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001988 if (max_char < 256)
1989 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1990 PyUnicode_1BYTE_DATA(res));
1991 else if (max_char < 0x10000)
1992 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1993 PyUnicode_2BYTE_DATA(res));
1994 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001996 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return res;
1998}
1999
2000PyObject*
2001PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2002{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002003 if (size < 0) {
2004 PyErr_SetString(PyExc_ValueError, "size must be positive");
2005 return NULL;
2006 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002007 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002009 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002014 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015 PyErr_SetString(PyExc_SystemError, "invalid kind");
2016 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018}
2019
Victor Stinnerece58de2012-04-23 23:36:38 +02002020Py_UCS4
2021_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2022{
2023 enum PyUnicode_Kind kind;
2024 void *startptr, *endptr;
2025
2026 assert(PyUnicode_IS_READY(unicode));
2027 assert(0 <= start);
2028 assert(end <= PyUnicode_GET_LENGTH(unicode));
2029 assert(start <= end);
2030
2031 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2032 return PyUnicode_MAX_CHAR_VALUE(unicode);
2033
2034 if (start == end)
2035 return 127;
2036
Victor Stinner94d558b2012-04-27 22:26:58 +02002037 if (PyUnicode_IS_ASCII(unicode))
2038 return 127;
2039
Victor Stinnerece58de2012-04-23 23:36:38 +02002040 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002041 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002042 endptr = (char *)startptr + end * kind;
2043 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002044 switch(kind) {
2045 case PyUnicode_1BYTE_KIND:
2046 return ucs1lib_find_max_char(startptr, endptr);
2047 case PyUnicode_2BYTE_KIND:
2048 return ucs2lib_find_max_char(startptr, endptr);
2049 case PyUnicode_4BYTE_KIND:
2050 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002051 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002052 assert(0);
2053 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002054 }
2055}
2056
Victor Stinner25a4b292011-10-06 12:31:55 +02002057/* Ensure that a string uses the most efficient storage, if it is not the
2058 case: create a new string with of the right kind. Write NULL into *p_unicode
2059 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002060static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002061unicode_adjust_maxchar(PyObject **p_unicode)
2062{
2063 PyObject *unicode, *copy;
2064 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002065 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002066 unsigned int kind;
2067
2068 assert(p_unicode != NULL);
2069 unicode = *p_unicode;
2070 assert(PyUnicode_IS_READY(unicode));
2071 if (PyUnicode_IS_ASCII(unicode))
2072 return;
2073
2074 len = PyUnicode_GET_LENGTH(unicode);
2075 kind = PyUnicode_KIND(unicode);
2076 if (kind == PyUnicode_1BYTE_KIND) {
2077 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002078 max_char = ucs1lib_find_max_char(u, u + len);
2079 if (max_char >= 128)
2080 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002081 }
2082 else if (kind == PyUnicode_2BYTE_KIND) {
2083 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002084 max_char = ucs2lib_find_max_char(u, u + len);
2085 if (max_char >= 256)
2086 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002087 }
2088 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 max_char = ucs4lib_find_max_char(u, u + len);
2092 if (max_char >= 0x10000)
2093 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002095 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002096 if (copy != NULL)
2097 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 Py_DECREF(unicode);
2099 *p_unicode = copy;
2100}
2101
Victor Stinner034f6cf2011-09-30 02:26:44 +02002102PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002103_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104{
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002106 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002107
Victor Stinner034f6cf2011-09-30 02:26:44 +02002108 if (!PyUnicode_Check(unicode)) {
2109 PyErr_BadInternalCall();
2110 return NULL;
2111 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002112 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002113 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114
Victor Stinner87af4f22011-11-21 23:03:47 +01002115 length = PyUnicode_GET_LENGTH(unicode);
2116 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002117 if (!copy)
2118 return NULL;
2119 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2120
Victor Stinner87af4f22011-11-21 23:03:47 +01002121 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2122 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002123 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002124 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002125}
2126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127
Victor Stinnerbc603d12011-10-02 01:00:40 +02002128/* Widen Unicode objects to larger buffers. Don't write terminating null
2129 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130
2131void*
2132_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2133{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002134 Py_ssize_t len;
2135 void *result;
2136 unsigned int skind;
2137
Benjamin Petersonbac79492012-01-14 13:34:47 -05002138 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002139 return NULL;
2140
2141 len = PyUnicode_GET_LENGTH(s);
2142 skind = PyUnicode_KIND(s);
2143 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002144 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 return NULL;
2146 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002147 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002148 case PyUnicode_2BYTE_KIND:
2149 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2150 if (!result)
2151 return PyErr_NoMemory();
2152 assert(skind == PyUnicode_1BYTE_KIND);
2153 _PyUnicode_CONVERT_BYTES(
2154 Py_UCS1, Py_UCS2,
2155 PyUnicode_1BYTE_DATA(s),
2156 PyUnicode_1BYTE_DATA(s) + len,
2157 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002159 case PyUnicode_4BYTE_KIND:
2160 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2161 if (!result)
2162 return PyErr_NoMemory();
2163 if (skind == PyUnicode_2BYTE_KIND) {
2164 _PyUnicode_CONVERT_BYTES(
2165 Py_UCS2, Py_UCS4,
2166 PyUnicode_2BYTE_DATA(s),
2167 PyUnicode_2BYTE_DATA(s) + len,
2168 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170 else {
2171 assert(skind == PyUnicode_1BYTE_KIND);
2172 _PyUnicode_CONVERT_BYTES(
2173 Py_UCS1, Py_UCS4,
2174 PyUnicode_1BYTE_DATA(s),
2175 PyUnicode_1BYTE_DATA(s) + len,
2176 result);
2177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 default:
2180 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 }
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184}
2185
2186static Py_UCS4*
2187as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2188 int copy_null)
2189{
2190 int kind;
2191 void *data;
2192 Py_ssize_t len, targetlen;
2193 if (PyUnicode_READY(string) == -1)
2194 return NULL;
2195 kind = PyUnicode_KIND(string);
2196 data = PyUnicode_DATA(string);
2197 len = PyUnicode_GET_LENGTH(string);
2198 targetlen = len;
2199 if (copy_null)
2200 targetlen++;
2201 if (!target) {
2202 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2203 PyErr_NoMemory();
2204 return NULL;
2205 }
2206 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2207 if (!target) {
2208 PyErr_NoMemory();
2209 return NULL;
2210 }
2211 }
2212 else {
2213 if (targetsize < targetlen) {
2214 PyErr_Format(PyExc_SystemError,
2215 "string is longer than the buffer");
2216 if (copy_null && 0 < targetsize)
2217 target[0] = 0;
2218 return NULL;
2219 }
2220 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002221 if (kind == PyUnicode_1BYTE_KIND) {
2222 Py_UCS1 *start = (Py_UCS1 *) data;
2223 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002225 else if (kind == PyUnicode_2BYTE_KIND) {
2226 Py_UCS2 *start = (Py_UCS2 *) data;
2227 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2228 }
2229 else {
2230 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 if (copy_null)
2234 target[len] = 0;
2235 return target;
2236}
2237
2238Py_UCS4*
2239PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2240 int copy_null)
2241{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002242 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyErr_BadInternalCall();
2244 return NULL;
2245 }
2246 return as_ucs4(string, target, targetsize, copy_null);
2247}
2248
2249Py_UCS4*
2250PyUnicode_AsUCS4Copy(PyObject *string)
2251{
2252 return as_ucs4(string, NULL, 0, 1);
2253}
2254
2255#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002256
Alexander Belopolsky40018472011-02-26 01:02:56 +00002257PyObject *
2258PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002262 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002263 PyErr_BadInternalCall();
2264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 }
2266
Martin v. Löwis790465f2008-04-05 20:41:37 +00002267 if (size == -1) {
2268 size = wcslen(w);
2269 }
2270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272}
2273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002275
Walter Dörwald346737f2007-05-31 10:44:43 +00002276static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002277makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002278 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002279{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 if (longflag)
2282 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002283 else if (longlongflag) {
2284 /* longlongflag should only ever be nonzero on machines with
2285 HAVE_LONG_LONG defined */
2286#ifdef HAVE_LONG_LONG
2287 char *f = PY_FORMAT_LONG_LONG;
2288 while (*f)
2289 *fmt++ = *f++;
2290#else
2291 /* we shouldn't ever get here */
2292 assert(0);
2293 *fmt++ = 'l';
2294#endif
2295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002296 else if (size_tflag) {
2297 char *f = PY_FORMAT_SIZE_T;
2298 while (*f)
2299 *fmt++ = *f++;
2300 }
2301 *fmt++ = c;
2302 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002303}
2304
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002309
2310static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002311unicode_fromformat_arg(_PyUnicodeWriter *writer,
2312 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002313{
Victor Stinnere215d962012-10-06 23:03:36 +02002314 const char *p;
2315 Py_ssize_t len;
2316 int zeropad;
2317 int width;
2318 int precision;
2319 int longflag;
2320 int longlongflag;
2321 int size_tflag;
2322 int fill;
2323
2324 p = f;
2325 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002326 zeropad = 0;
2327 if (*f == '0') {
2328 zeropad = 1;
2329 f++;
2330 }
Victor Stinner96865452011-03-01 23:44:09 +00002331
2332 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002333 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002334 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002335 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2336 PyErr_SetString(PyExc_ValueError,
2337 "width too big");
2338 return NULL;
2339 }
Victor Stinnere215d962012-10-06 23:03:36 +02002340 width = (width*10) + (*f - '0');
2341 f++;
2342 }
Victor Stinner96865452011-03-01 23:44:09 +00002343 precision = 0;
2344 if (*f == '.') {
2345 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002346 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002347 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2348 PyErr_SetString(PyExc_ValueError,
2349 "precision too big");
2350 return NULL;
2351 }
Victor Stinnere215d962012-10-06 23:03:36 +02002352 precision = (precision*10) + (*f - '0');
2353 f++;
2354 }
Victor Stinner96865452011-03-01 23:44:09 +00002355 if (*f == '%') {
2356 /* "%.3%s" => f points to "3" */
2357 f--;
2358 }
2359 }
2360 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002361 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002362 f--;
2363 }
Victor Stinner96865452011-03-01 23:44:09 +00002364
2365 /* Handle %ld, %lu, %lld and %llu. */
2366 longflag = 0;
2367 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002368 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002369 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002370 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002371 longflag = 1;
2372 ++f;
2373 }
2374#ifdef HAVE_LONG_LONG
2375 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002376 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002377 longlongflag = 1;
2378 f += 2;
2379 }
2380#endif
2381 }
2382 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002383 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002384 size_tflag = 1;
2385 ++f;
2386 }
Victor Stinnere215d962012-10-06 23:03:36 +02002387
2388 if (f[1] == '\0')
2389 writer->overallocate = 0;
2390
2391 switch (*f) {
2392 case 'c':
2393 {
2394 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002395 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2396 PyErr_SetString(PyExc_ValueError,
2397 "character argument not in range(0x110000)");
2398 return NULL;
2399 }
Victor Stinnere215d962012-10-06 23:03:36 +02002400 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2401 return NULL;
2402 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2403 writer->pos++;
2404 break;
2405 }
2406
2407 case 'i':
2408 case 'd':
2409 case 'u':
2410 case 'x':
2411 {
2412 /* used by sprintf */
2413 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002414 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002415
2416 if (*f == 'u') {
2417 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2418
2419 if (longflag)
2420 len = sprintf(buffer, fmt,
2421 va_arg(*vargs, unsigned long));
2422#ifdef HAVE_LONG_LONG
2423 else if (longlongflag)
2424 len = sprintf(buffer, fmt,
2425 va_arg(*vargs, unsigned PY_LONG_LONG));
2426#endif
2427 else if (size_tflag)
2428 len = sprintf(buffer, fmt,
2429 va_arg(*vargs, size_t));
2430 else
2431 len = sprintf(buffer, fmt,
2432 va_arg(*vargs, unsigned int));
2433 }
2434 else if (*f == 'x') {
2435 makefmt(fmt, 0, 0, 0, 'x');
2436 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2437 }
2438 else {
2439 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2440
2441 if (longflag)
2442 len = sprintf(buffer, fmt,
2443 va_arg(*vargs, long));
2444#ifdef HAVE_LONG_LONG
2445 else if (longlongflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, PY_LONG_LONG));
2448#endif
2449 else if (size_tflag)
2450 len = sprintf(buffer, fmt,
2451 va_arg(*vargs, Py_ssize_t));
2452 else
2453 len = sprintf(buffer, fmt,
2454 va_arg(*vargs, int));
2455 }
2456 assert(len >= 0);
2457
Victor Stinnere215d962012-10-06 23:03:36 +02002458 if (precision < len)
2459 precision = len;
2460 if (width > precision) {
2461 Py_UCS4 fillchar;
2462 fill = width - precision;
2463 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002464 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2465 return NULL;
2466 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2467 return NULL;
2468 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002469 }
Victor Stinner15a11362012-10-06 23:48:20 +02002470 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002471 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002472 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2473 return NULL;
2474 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2475 return NULL;
2476 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002477 }
Victor Stinner15a11362012-10-06 23:48:20 +02002478 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002479 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002480 break;
2481 }
2482
2483 case 'p':
2484 {
2485 char number[MAX_LONG_LONG_CHARS];
2486
2487 len = sprintf(number, "%p", va_arg(*vargs, void*));
2488 assert(len >= 0);
2489
2490 /* %p is ill-defined: ensure leading 0x. */
2491 if (number[1] == 'X')
2492 number[1] = 'x';
2493 else if (number[1] != 'x') {
2494 memmove(number + 2, number,
2495 strlen(number) + 1);
2496 number[0] = '0';
2497 number[1] = 'x';
2498 len += 2;
2499 }
2500
2501 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2502 return NULL;
2503 break;
2504 }
2505
2506 case 's':
2507 {
2508 /* UTF-8 */
2509 const char *s = va_arg(*vargs, const char*);
2510 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2511 if (!str)
2512 return NULL;
2513 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2514 Py_DECREF(str);
2515 return NULL;
2516 }
2517 Py_DECREF(str);
2518 break;
2519 }
2520
2521 case 'U':
2522 {
2523 PyObject *obj = va_arg(*vargs, PyObject *);
2524 assert(obj && _PyUnicode_CHECK(obj));
2525
2526 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2527 return NULL;
2528 break;
2529 }
2530
2531 case 'V':
2532 {
2533 PyObject *obj = va_arg(*vargs, PyObject *);
2534 const char *str = va_arg(*vargs, const char *);
2535 PyObject *str_obj;
2536 assert(obj || str);
2537 if (obj) {
2538 assert(_PyUnicode_CHECK(obj));
2539 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2540 return NULL;
2541 }
2542 else {
2543 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2544 if (!str_obj)
2545 return NULL;
2546 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2547 Py_DECREF(str_obj);
2548 return NULL;
2549 }
2550 Py_DECREF(str_obj);
2551 }
2552 break;
2553 }
2554
2555 case 'S':
2556 {
2557 PyObject *obj = va_arg(*vargs, PyObject *);
2558 PyObject *str;
2559 assert(obj);
2560 str = PyObject_Str(obj);
2561 if (!str)
2562 return NULL;
2563 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2564 Py_DECREF(str);
2565 return NULL;
2566 }
2567 Py_DECREF(str);
2568 break;
2569 }
2570
2571 case 'R':
2572 {
2573 PyObject *obj = va_arg(*vargs, PyObject *);
2574 PyObject *repr;
2575 assert(obj);
2576 repr = PyObject_Repr(obj);
2577 if (!repr)
2578 return NULL;
2579 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2580 Py_DECREF(repr);
2581 return NULL;
2582 }
2583 Py_DECREF(repr);
2584 break;
2585 }
2586
2587 case 'A':
2588 {
2589 PyObject *obj = va_arg(*vargs, PyObject *);
2590 PyObject *ascii;
2591 assert(obj);
2592 ascii = PyObject_ASCII(obj);
2593 if (!ascii)
2594 return NULL;
2595 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2596 Py_DECREF(ascii);
2597 return NULL;
2598 }
2599 Py_DECREF(ascii);
2600 break;
2601 }
2602
2603 case '%':
2604 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2605 return NULL;
2606 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2607 writer->pos++;
2608 break;
2609
2610 default:
2611 /* if we stumble upon an unknown formatting code, copy the rest
2612 of the format string to the output string. (we cannot just
2613 skip the code, since there's no way to know what's in the
2614 argument list) */
2615 len = strlen(p);
2616 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2617 return NULL;
2618 f = p+len;
2619 return f;
2620 }
2621
2622 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002623 return f;
2624}
2625
Walter Dörwaldd2034312007-05-18 16:29:38 +00002626PyObject *
2627PyUnicode_FromFormatV(const char *format, va_list vargs)
2628{
Victor Stinnere215d962012-10-06 23:03:36 +02002629 va_list vargs2;
2630 const char *f;
2631 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002632
Victor Stinnere215d962012-10-06 23:03:36 +02002633 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2634
2635 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2636 Copy it to be able to pass a reference to a subfunction. */
2637 Py_VA_COPY(vargs2, vargs);
2638
2639 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002641 f = unicode_fromformat_arg(&writer, f, &vargs2);
2642 if (f == NULL)
2643 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002646 const char *p;
2647 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002648
Victor Stinnere215d962012-10-06 23:03:36 +02002649 p = f;
2650 do
2651 {
2652 if ((unsigned char)*p > 127) {
2653 PyErr_Format(PyExc_ValueError,
2654 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2655 "string, got a non-ASCII byte: 0x%02x",
2656 (unsigned char)*p);
2657 return NULL;
2658 }
2659 p++;
2660 }
2661 while (*p != '\0' && *p != '%');
2662 len = p - f;
2663
2664 if (*p == '\0')
2665 writer.overallocate = 0;
2666 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2667 goto fail;
2668 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2669 writer.pos += len;
2670
2671 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Victor Stinnere215d962012-10-06 23:03:36 +02002674 return _PyUnicodeWriter_Finish(&writer);
2675
2676 fail:
2677 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002679}
2680
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681PyObject *
2682PyUnicode_FromFormat(const char *format, ...)
2683{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 PyObject* ret;
2685 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686
2687#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 ret = PyUnicode_FromFormatV(format, vargs);
2693 va_end(vargs);
2694 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695}
2696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697#ifdef HAVE_WCHAR_H
2698
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2700 convert a Unicode object to a wide character string.
2701
Victor Stinnerd88d9832011-09-06 02:00:05 +02002702 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002703 character) required to convert the unicode object. Ignore size argument.
2704
Victor Stinnerd88d9832011-09-06 02:00:05 +02002705 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002706 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002707 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002708static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002709unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002710 wchar_t *w,
2711 Py_ssize_t size)
2712{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 const wchar_t *wstr;
2715
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002716 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002717 if (wstr == NULL)
2718 return -1;
2719
Victor Stinner5593d8a2010-10-02 11:11:27 +00002720 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 if (size > res)
2722 size = res + 1;
2723 else
2724 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726 return res;
2727 }
2728 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002730}
2731
2732Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002733PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002734 wchar_t *w,
2735 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736{
2737 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 PyErr_BadInternalCall();
2739 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002741 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742}
2743
Victor Stinner137c34c2010-09-29 10:25:54 +00002744wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002745PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002746 Py_ssize_t *size)
2747{
2748 wchar_t* buffer;
2749 Py_ssize_t buflen;
2750
2751 if (unicode == NULL) {
2752 PyErr_BadInternalCall();
2753 return NULL;
2754 }
2755
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002756 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 if (buflen == -1)
2758 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002759 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002760 PyErr_NoMemory();
2761 return NULL;
2762 }
2763
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2765 if (buffer == NULL) {
2766 PyErr_NoMemory();
2767 return NULL;
2768 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002769 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002770 if (buflen == -1) {
2771 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002773 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 if (size != NULL)
2775 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002776 return buffer;
2777}
2778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002785 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 PyErr_SetString(PyExc_ValueError,
2787 "chr() arg not in range(0x110000)");
2788 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002789 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 if (ordinal < 256)
2792 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 v = PyUnicode_New(1, ordinal);
2795 if (v == NULL)
2796 return NULL;
2797 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002798 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002800}
2801
Alexander Belopolsky40018472011-02-26 01:02:56 +00002802PyObject *
2803PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002805 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002807 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002808 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002809 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 Py_INCREF(obj);
2811 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002812 }
2813 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 /* For a Unicode subtype that's not a Unicode object,
2815 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002816 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002817 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002818 PyErr_Format(PyExc_TypeError,
2819 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002820 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002821 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002826 const char *encoding,
2827 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002828{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002829 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002831
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 PyErr_BadInternalCall();
2834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 /* Decoding bytes objects is the most common case and should be fast */
2838 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002839 if (PyBytes_GET_SIZE(obj) == 0)
2840 _Py_RETURN_UNICODE_EMPTY();
2841 v = PyUnicode_Decode(
2842 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2843 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002844 return v;
2845 }
2846
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002847 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyErr_SetString(PyExc_TypeError,
2849 "decoding str is not supported");
2850 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002851 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002853 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2854 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2855 PyErr_Format(PyExc_TypeError,
2856 "coercing to str: need bytes, bytearray "
2857 "or buffer-like object, %.80s found",
2858 Py_TYPE(obj)->tp_name);
2859 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002860 }
Tim Petersced69f82003-09-16 20:30:58 +00002861
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002863 PyBuffer_Release(&buffer);
2864 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002866
Serhiy Storchaka05997252013-01-26 12:14:02 +02002867 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002868 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002869 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870}
2871
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The NUL-terminated result is stored in lower, which must have room for
   lower_len bytes.  Return 1 on success, 0 on error (the normalized name,
   including its terminating NUL, does not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Not even the terminating NUL fits; bailing out here also keeps
       lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating NUL. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* last slot, reserved for the NUL */
    while (*e) {
        if (l == l_end)
            return 0;
        /* ASCII-only case folding, independent of the C locale. */
        if (*e >= 'A' && *e <= 'Z') {
            *l++ = (char)(*e++ - 'A' + 'a');
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
2910PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 Py_ssize_t size,
2912 const char *encoding,
2913 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002914{
2915 PyObject *buffer = NULL, *unicode;
2916 Py_buffer info;
2917 char lower[11]; /* Enough for any encoding shortcut */
2918
Fred Drakee4315f52000-05-09 19:53:39 +00002919 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002920 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002921 if ((strcmp(lower, "utf-8") == 0) ||
2922 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002923 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002924 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002925 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002926 (strcmp(lower, "iso-8859-1") == 0))
2927 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002928#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002929 else if (strcmp(lower, "mbcs") == 0)
2930 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002931#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002932 else if (strcmp(lower, "ascii") == 0)
2933 return PyUnicode_DecodeASCII(s, size, errors);
2934 else if (strcmp(lower, "utf-16") == 0)
2935 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2936 else if (strcmp(lower, "utf-32") == 0)
2937 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
2940 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002941 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002942 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002943 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002944 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 if (buffer == NULL)
2946 goto onError;
2947 unicode = PyCodec_Decode(buffer, encoding, errors);
2948 if (unicode == NULL)
2949 goto onError;
2950 if (!PyUnicode_Check(unicode)) {
2951 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002952 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002953 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 Py_DECREF(unicode);
2955 goto onError;
2956 }
2957 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002958 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002959
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 Py_XDECREF(buffer);
2962 return NULL;
2963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 const char *encoding,
2968 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002969{
2970 PyObject *v;
2971
2972 if (!PyUnicode_Check(unicode)) {
2973 PyErr_BadArgument();
2974 goto onError;
2975 }
2976
2977 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002979
2980 /* Decode via the codec registry */
2981 v = PyCodec_Decode(unicode, encoding, errors);
2982 if (v == NULL)
2983 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002984 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002985
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
3009 if (!PyUnicode_Check(v)) {
3010 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003011 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012 Py_TYPE(v)->tp_name);
3013 Py_DECREF(v);
3014 goto onError;
3015 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003016 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019 return NULL;
3020}
3021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003024 Py_ssize_t size,
3025 const char *encoding,
3026 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027{
3028 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003029
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 unicode = PyUnicode_FromUnicode(s, size);
3031 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3034 Py_DECREF(unicode);
3035 return v;
3036}
3037
Alexander Belopolsky40018472011-02-26 01:02:56 +00003038PyObject *
3039PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003040 const char *encoding,
3041 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042{
3043 PyObject *v;
3044
3045 if (!PyUnicode_Check(unicode)) {
3046 PyErr_BadArgument();
3047 goto onError;
3048 }
3049
3050 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003052
3053 /* Encode via the codec registry */
3054 v = PyCodec_Encode(unicode, encoding, errors);
3055 if (v == NULL)
3056 goto onError;
3057 return v;
3058
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060 return NULL;
3061}
3062
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 2 bytes wide).  Returns the index, in wchar_t
   units, of the first character whose conversion fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];           /* up to a surrogate pair, plus NUL */
#else
    wchar_t chunk[2];           /* one character, plus NUL */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3109
Victor Stinner1b579672011-12-17 05:47:23 +01003110static int
3111locale_error_handler(const char *errors, int *surrogateescape)
3112{
3113 if (errors == NULL) {
3114 *surrogateescape = 0;
3115 return 0;
3116 }
3117
3118 if (strcmp(errors, "strict") == 0) {
3119 *surrogateescape = 0;
3120 return 0;
3121 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003122 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003123 *surrogateescape = 1;
3124 return 0;
3125 }
3126 PyErr_Format(PyExc_ValueError,
3127 "only 'strict' and 'surrogateescape' error handlers "
3128 "are supported, not '%s'",
3129 errors);
3130 return -1;
3131}
3132
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003133PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003134PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003135{
3136 Py_ssize_t wlen, wlen2;
3137 wchar_t *wstr;
3138 PyObject *bytes = NULL;
3139 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003140 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003141 PyObject *exc;
3142 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003143 int surrogateescape;
3144
3145 if (locale_error_handler(errors, &surrogateescape) < 0)
3146 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003147
3148 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3149 if (wstr == NULL)
3150 return NULL;
3151
3152 wlen2 = wcslen(wstr);
3153 if (wlen2 != wlen) {
3154 PyMem_Free(wstr);
3155 PyErr_SetString(PyExc_TypeError, "embedded null character");
3156 return NULL;
3157 }
3158
3159 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003160 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003161 char *str;
3162
3163 str = _Py_wchar2char(wstr, &error_pos);
3164 if (str == NULL) {
3165 if (error_pos == (size_t)-1) {
3166 PyErr_NoMemory();
3167 PyMem_Free(wstr);
3168 return NULL;
3169 }
3170 else {
3171 goto encode_error;
3172 }
3173 }
3174 PyMem_Free(wstr);
3175
3176 bytes = PyBytes_FromString(str);
3177 PyMem_Free(str);
3178 }
3179 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003180 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003181 size_t len, len2;
3182
3183 len = wcstombs(NULL, wstr, 0);
3184 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003185 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003186 goto encode_error;
3187 }
3188
3189 bytes = PyBytes_FromStringAndSize(NULL, len);
3190 if (bytes == NULL) {
3191 PyMem_Free(wstr);
3192 return NULL;
3193 }
3194
3195 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3196 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003197 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003198 goto encode_error;
3199 }
3200 PyMem_Free(wstr);
3201 }
3202 return bytes;
3203
3204encode_error:
3205 errmsg = strerror(errno);
3206 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003207
3208 if (error_pos == (size_t)-1)
3209 error_pos = wcstombs_errorpos(wstr);
3210
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003211 PyMem_Free(wstr);
3212 Py_XDECREF(bytes);
3213
Victor Stinner2f197072011-12-17 07:08:30 +01003214 if (errmsg != NULL) {
3215 size_t errlen;
3216 wstr = _Py_char2wchar(errmsg, &errlen);
3217 if (wstr != NULL) {
3218 reason = PyUnicode_FromWideChar(wstr, errlen);
3219 PyMem_Free(wstr);
3220 } else
3221 errmsg = NULL;
3222 }
3223 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003224 reason = PyUnicode_FromString(
3225 "wcstombs() encountered an unencodable "
3226 "wide character");
3227 if (reason == NULL)
3228 return NULL;
3229
3230 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3231 "locale", unicode,
3232 (Py_ssize_t)error_pos,
3233 (Py_ssize_t)(error_pos+1),
3234 reason);
3235 Py_DECREF(reason);
3236 if (exc != NULL) {
3237 PyCodec_StrictErrors(exc);
3238 Py_XDECREF(exc);
3239 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240 return NULL;
3241}
3242
Victor Stinnerad158722010-10-27 00:25:46 +00003243PyObject *
3244PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003245{
Victor Stinner99b95382011-07-04 14:23:54 +02003246#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003247 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003248#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003249 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003250#else
Victor Stinner793b5312011-04-27 00:24:21 +02003251 PyInterpreterState *interp = PyThreadState_GET()->interp;
3252 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3253 cannot use it to encode and decode filenames before it is loaded. Load
3254 the Python codec requires to encode at least its own filename. Use the C
3255 version of the locale codec until the codec registry is initialized and
3256 the Python codec is loaded.
3257
3258 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3259 cannot only rely on it: check also interp->fscodec_initialized for
3260 subinterpreters. */
3261 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003262 return PyUnicode_AsEncodedString(unicode,
3263 Py_FileSystemDefaultEncoding,
3264 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003265 }
3266 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003267 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003268 }
Victor Stinnerad158722010-10-27 00:25:46 +00003269#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003270}
3271
Alexander Belopolsky40018472011-02-26 01:02:56 +00003272PyObject *
3273PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003274 const char *encoding,
3275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
3277 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003278 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003279
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 }
Fred Drakee4315f52000-05-09 19:53:39 +00003284
Fred Drakee4315f52000-05-09 19:53:39 +00003285 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003286 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003287 if ((strcmp(lower, "utf-8") == 0) ||
3288 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003289 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003290 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003292 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003293 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003294 }
Victor Stinner37296e82010-06-10 13:36:23 +00003295 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003296 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003297 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003298 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003299#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003300 else if (strcmp(lower, "mbcs") == 0)
3301 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003302#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003303 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306
3307 /* Encode via the codec registry */
3308 v = PyCodec_Encode(unicode, encoding, errors);
3309 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003310 return NULL;
3311
3312 /* The normal path */
3313 if (PyBytes_Check(v))
3314 return v;
3315
3316 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003317 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003318 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003319 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003320
3321 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3322 "encoder %s returned bytearray instead of bytes",
3323 encoding);
3324 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003325 Py_DECREF(v);
3326 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003327 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003328
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003329 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3330 Py_DECREF(v);
3331 return b;
3332 }
3333
3334 PyErr_Format(PyExc_TypeError,
3335 "encoder did not return a bytes object (type=%.400s)",
3336 Py_TYPE(v)->tp_name);
3337 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003338 return NULL;
3339}
3340
Alexander Belopolsky40018472011-02-26 01:02:56 +00003341PyObject *
3342PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003343 const char *encoding,
3344 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003345{
3346 PyObject *v;
3347
3348 if (!PyUnicode_Check(unicode)) {
3349 PyErr_BadArgument();
3350 goto onError;
3351 }
3352
3353 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003355
3356 /* Encode via the codec registry */
3357 v = PyCodec_Encode(unicode, encoding, errors);
3358 if (v == NULL)
3359 goto onError;
3360 if (!PyUnicode_Check(v)) {
3361 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003362 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003363 Py_TYPE(v)->tp_name);
3364 Py_DECREF(v);
3365 goto onError;
3366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003368
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 return NULL;
3371}
3372
/* Locate the first byte sequence in str (len bytes) that cannot be
   converted to a wide character.

   With HAVE_MBRTOWC the input is walked one multibyte character at a time
   using mbrtowc() and a zero-initialized conversion state; the byte offset
   of the first invalid or incomplete sequence is returned.  Returns 0 if
   no such sequence is found, and always returns 0 when mbrtowc() is not
   available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3403
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003404PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003405PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003406 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003407{
3408 wchar_t smallbuf[256];
3409 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3410 wchar_t *wstr;
3411 size_t wlen, wlen2;
3412 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003413 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003414 size_t error_pos;
3415 char *errmsg;
3416 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003417
3418 if (locale_error_handler(errors, &surrogateescape) < 0)
3419 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003420
3421 if (str[len] != '\0' || len != strlen(str)) {
3422 PyErr_SetString(PyExc_TypeError, "embedded null character");
3423 return NULL;
3424 }
3425
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003426 if (surrogateescape) {
3427 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003428 wstr = _Py_char2wchar(str, &wlen);
3429 if (wstr == NULL) {
3430 if (wlen == (size_t)-1)
3431 PyErr_NoMemory();
3432 else
3433 PyErr_SetFromErrno(PyExc_OSError);
3434 return NULL;
3435 }
3436
3437 unicode = PyUnicode_FromWideChar(wstr, wlen);
3438 PyMem_Free(wstr);
3439 }
3440 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003441 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003442#ifndef HAVE_BROKEN_MBSTOWCS
3443 wlen = mbstowcs(NULL, str, 0);
3444#else
3445 wlen = len;
3446#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003447 if (wlen == (size_t)-1)
3448 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003449 if (wlen+1 <= smallbuf_len) {
3450 wstr = smallbuf;
3451 }
3452 else {
3453 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3454 return PyErr_NoMemory();
3455
3456 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3457 if (!wstr)
3458 return PyErr_NoMemory();
3459 }
3460
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003461 wlen2 = mbstowcs(wstr, str, wlen+1);
3462 if (wlen2 == (size_t)-1) {
3463 if (wstr != smallbuf)
3464 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003465 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003466 }
3467#ifdef HAVE_BROKEN_MBSTOWCS
3468 assert(wlen2 == wlen);
3469#endif
3470 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3471 if (wstr != smallbuf)
3472 PyMem_Free(wstr);
3473 }
3474 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003475
3476decode_error:
3477 errmsg = strerror(errno);
3478 assert(errmsg != NULL);
3479
3480 error_pos = mbstowcs_errorpos(str, len);
3481 if (errmsg != NULL) {
3482 size_t errlen;
3483 wstr = _Py_char2wchar(errmsg, &errlen);
3484 if (wstr != NULL) {
3485 reason = PyUnicode_FromWideChar(wstr, errlen);
3486 PyMem_Free(wstr);
3487 } else
3488 errmsg = NULL;
3489 }
3490 if (errmsg == NULL)
3491 reason = PyUnicode_FromString(
3492 "mbstowcs() encountered an invalid multibyte sequence");
3493 if (reason == NULL)
3494 return NULL;
3495
3496 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3497 "locale", str, len,
3498 (Py_ssize_t)error_pos,
3499 (Py_ssize_t)(error_pos+1),
3500 reason);
3501 Py_DECREF(reason);
3502 if (exc != NULL) {
3503 PyCodec_StrictErrors(exc);
3504 Py_XDECREF(exc);
3505 }
3506 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507}
3508
3509PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003510PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003511{
3512 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003513 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003514}
3515
3516
3517PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003518PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003519 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003520 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3521}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003522
Christian Heimes5894ba72007-11-04 11:43:14 +00003523PyObject*
3524PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3525{
Victor Stinner99b95382011-07-04 14:23:54 +02003526#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003527 return PyUnicode_DecodeMBCS(s, size, NULL);
3528#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003529 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003530#else
Victor Stinner793b5312011-04-27 00:24:21 +02003531 PyInterpreterState *interp = PyThreadState_GET()->interp;
3532 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3533 cannot use it to encode and decode filenames before it is loaded. Load
3534 the Python codec requires to encode at least its own filename. Use the C
3535 version of the locale codec until the codec registry is initialized and
3536 the Python codec is loaded.
3537
3538 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3539 cannot only rely on it: check also interp->fscodec_initialized for
3540 subinterpreters. */
3541 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003542 return PyUnicode_Decode(s, size,
3543 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003544 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003545 }
3546 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003547 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003548 }
Victor Stinnerad158722010-10-27 00:25:46 +00003549#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003550}
3551
Martin v. Löwis011e8422009-05-05 04:43:17 +00003552
3553int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003554_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003555{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003556 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003557
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003558 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003559 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003560 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3561 PyUnicode_GET_LENGTH(str), '\0', 1);
3562 if (pos == -1)
3563 return 0;
3564 else
3565 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003566}
3567
Antoine Pitrou13348842012-01-29 18:36:34 +01003568int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003569PyUnicode_FSConverter(PyObject* arg, void* addr)
3570{
3571 PyObject *output = NULL;
3572 Py_ssize_t size;
3573 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003574 if (arg == NULL) {
3575 Py_DECREF(*(PyObject**)addr);
3576 return 1;
3577 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003578 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003579 output = arg;
3580 Py_INCREF(output);
3581 }
3582 else {
3583 arg = PyUnicode_FromObject(arg);
3584 if (!arg)
3585 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003586 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003587 Py_DECREF(arg);
3588 if (!output)
3589 return 0;
3590 if (!PyBytes_Check(output)) {
3591 Py_DECREF(output);
3592 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3593 return 0;
3594 }
3595 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003596 size = PyBytes_GET_SIZE(output);
3597 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003598 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003599 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003600 Py_DECREF(output);
3601 return 0;
3602 }
3603 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003604 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003605}
3606
3607
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003608int
3609PyUnicode_FSDecoder(PyObject* arg, void* addr)
3610{
3611 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003612 if (arg == NULL) {
3613 Py_DECREF(*(PyObject**)addr);
3614 return 1;
3615 }
3616 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003617 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003618 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003619 output = arg;
3620 Py_INCREF(output);
3621 }
3622 else {
3623 arg = PyBytes_FromObject(arg);
3624 if (!arg)
3625 return 0;
3626 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3627 PyBytes_GET_SIZE(arg));
3628 Py_DECREF(arg);
3629 if (!output)
3630 return 0;
3631 if (!PyUnicode_Check(output)) {
3632 Py_DECREF(output);
3633 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3634 return 0;
3635 }
3636 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003637 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003638 Py_DECREF(output);
3639 return 0;
3640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003641 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003642 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003643 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3644 Py_DECREF(output);
3645 return 0;
3646 }
3647 *(PyObject**)addr = output;
3648 return Py_CLEANUP_SUPPORTED;
3649}
3650
3651
Martin v. Löwis5b222132007-06-10 09:51:05 +00003652char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003653PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003654{
Christian Heimesf3863112007-11-22 07:46:41 +00003655 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
3659 return NULL;
3660 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003661 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003662 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003664 if (PyUnicode_UTF8(unicode) == NULL) {
3665 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003666 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3667 if (bytes == NULL)
3668 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003669 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3670 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003671 Py_DECREF(bytes);
3672 return NULL;
3673 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003674 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3675 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3676 PyBytes_AS_STRING(bytes),
3677 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678 Py_DECREF(bytes);
3679 }
3680
3681 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003682 *psize = PyUnicode_UTF8_LENGTH(unicode);
3683 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003684}
3685
3686char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003688{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003689 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3690}
3691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692Py_UNICODE *
3693PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 const unsigned char *one_byte;
3696#if SIZEOF_WCHAR_T == 4
3697 const Py_UCS2 *two_bytes;
3698#else
3699 const Py_UCS4 *four_bytes;
3700 const Py_UCS4 *ucs4_end;
3701 Py_ssize_t num_surrogates;
3702#endif
3703 wchar_t *w;
3704 wchar_t *wchar_end;
3705
3706 if (!PyUnicode_Check(unicode)) {
3707 PyErr_BadArgument();
3708 return NULL;
3709 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003710 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003712 assert(_PyUnicode_KIND(unicode) != 0);
3713 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003715 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003717 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3718 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 num_surrogates = 0;
3720
3721 for (; four_bytes < ucs4_end; ++four_bytes) {
3722 if (*four_bytes > 0xFFFF)
3723 ++num_surrogates;
3724 }
3725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3727 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3728 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 PyErr_NoMemory();
3730 return NULL;
3731 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003732 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003734 w = _PyUnicode_WSTR(unicode);
3735 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3736 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3738 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003739 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003741 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3742 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 }
3744 else
3745 *w = *four_bytes;
3746
3747 if (w > wchar_end) {
3748 assert(0 && "Miscalculated string end");
3749 }
3750 }
3751 *w = 0;
3752#else
3753 /* sizeof(wchar_t) == 4 */
3754 Py_FatalError("Impossible unicode object state, wstr and str "
3755 "should share memory already.");
3756 return NULL;
3757#endif
3758 }
3759 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3761 (_PyUnicode_LENGTH(unicode) + 1));
3762 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 PyErr_NoMemory();
3764 return NULL;
3765 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3767 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3768 w = _PyUnicode_WSTR(unicode);
3769 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003771 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3772 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 for (; w < wchar_end; ++one_byte, ++w)
3774 *w = *one_byte;
3775 /* null-terminate the wstr */
3776 *w = 0;
3777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 for (; w < wchar_end; ++two_bytes, ++w)
3782 *w = *two_bytes;
3783 /* null-terminate the wstr */
3784 *w = 0;
3785#else
3786 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003787 PyObject_FREE(_PyUnicode_WSTR(unicode));
3788 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 Py_FatalError("Impossible unicode object state, wstr "
3790 "and str should share memory already.");
3791 return NULL;
3792#endif
3793 }
3794 else {
3795 assert(0 && "This should never happen.");
3796 }
3797 }
3798 }
3799 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 *size = PyUnicode_WSTR_LENGTH(unicode);
3801 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003802}
3803
Alexander Belopolsky40018472011-02-26 01:02:56 +00003804Py_UNICODE *
3805PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808}
3809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810
Alexander Belopolsky40018472011-02-26 01:02:56 +00003811Py_ssize_t
3812PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813{
3814 if (!PyUnicode_Check(unicode)) {
3815 PyErr_BadArgument();
3816 goto onError;
3817 }
3818 return PyUnicode_GET_SIZE(unicode);
3819
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 return -1;
3822}
3823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824Py_ssize_t
3825PyUnicode_GetLength(PyObject *unicode)
3826{
Victor Stinner07621332012-06-16 04:53:46 +02003827 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 PyErr_BadArgument();
3829 return -1;
3830 }
Victor Stinner07621332012-06-16 04:53:46 +02003831 if (PyUnicode_READY(unicode) == -1)
3832 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 return PyUnicode_GET_LENGTH(unicode);
3834}
3835
3836Py_UCS4
3837PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3838{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003839 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3840 PyErr_BadArgument();
3841 return (Py_UCS4)-1;
3842 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003843 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003844 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 return (Py_UCS4)-1;
3846 }
3847 return PyUnicode_READ_CHAR(unicode, index);
3848}
3849
3850int
3851PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3852{
3853 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003854 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 return -1;
3856 }
Victor Stinner488fa492011-12-12 00:01:39 +01003857 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003858 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003859 PyErr_SetString(PyExc_IndexError, "string index out of range");
3860 return -1;
3861 }
Victor Stinner488fa492011-12-12 00:01:39 +01003862 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003863 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003864 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3865 PyErr_SetString(PyExc_ValueError, "character out of range");
3866 return -1;
3867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3869 index, ch);
3870 return 0;
3871}
3872
/* Return the name of the default encoding; this is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3878
Victor Stinner554f3f02010-06-16 23:33:54 +00003879/* create or adjust a UnicodeDecodeError */
3880static void
3881make_decode_exception(PyObject **exceptionObject,
3882 const char *encoding,
3883 const char *input, Py_ssize_t length,
3884 Py_ssize_t startpos, Py_ssize_t endpos,
3885 const char *reason)
3886{
3887 if (*exceptionObject == NULL) {
3888 *exceptionObject = PyUnicodeDecodeError_Create(
3889 encoding, input, length, startpos, endpos, reason);
3890 }
3891 else {
3892 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3893 goto onError;
3894 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3895 goto onError;
3896 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3897 goto onError;
3898 }
3899 return;
3900
3901onError:
3902 Py_DECREF(*exceptionObject);
3903 *exceptionObject = NULL;
3904}
3905
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors/errorHandler: name of the error handler and its cached callable
                        (looked up with PyCodec_LookupError on first use).
   input/inend:         start/end of the input bytes; refreshed from the
                        exception's object attribute after the callback.
   startinpos/endinpos: error range; *endinpos is set to the new position.
   inptr:               receives the pointer where decoding resumes.
   output/outpos:       wchar_t-kind output string and write position; the
                        string is resized when the replacement does not fit.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not fall through: the PyBytes_* accessors below must only be
           applied to a bytes object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4012
4013static int
4014unicode_decode_call_errorhandler_writer(
4015 const char *errors, PyObject **errorHandler,
4016 const char *encoding, const char *reason,
4017 const char **input, const char **inend, Py_ssize_t *startinpos,
4018 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4019 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4020{
4021 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4022
4023 PyObject *restuple = NULL;
4024 PyObject *repunicode = NULL;
4025 Py_ssize_t insize;
4026 Py_ssize_t newpos;
4027 PyObject *inputobj = NULL;
4028
4029 if (*errorHandler == NULL) {
4030 *errorHandler = PyCodec_LookupError(errors);
4031 if (*errorHandler == NULL)
4032 goto onError;
4033 }
4034
4035 make_decode_exception(exceptionObject,
4036 encoding,
4037 *input, *inend - *input,
4038 *startinpos, *endinpos,
4039 reason);
4040 if (*exceptionObject == NULL)
4041 goto onError;
4042
4043 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4044 if (restuple == NULL)
4045 goto onError;
4046 if (!PyTuple_Check(restuple)) {
4047 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4048 goto onError;
4049 }
4050 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004051 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052
4053 /* Copy back the bytes variables, which might have been modified by the
4054 callback */
4055 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4056 if (!inputobj)
4057 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004058 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004060 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004061 *input = PyBytes_AS_STRING(inputobj);
4062 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004063 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004064 /* we can DECREF safely, as the exception has another reference,
4065 so the object won't go away. */
4066 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004070 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4072 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004075 writer->overallocate = 1;
4076 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4077 return
4078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004080 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004083 Py_XDECREF(restuple);
4084 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004088 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089}
4090
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091/* --- UTF-7 Codec -------------------------------------------------------- */
4092
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093/* See RFC2152 for details. We encode conservatively and decode liberally. */
4094
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* IS_BASE64: true if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     '+' == (c) || '/' == (c))

/* FROM_BASE64: value (0..63) of the base-64 character c.  The result is
   only meaningful when IS_BASE64(c) holds; any character that is not a
   letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('+' == (c)) ? 62 : 63)

/* TO_BASE64: base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes to
 * itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
4125
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, indexed by the ASCII code.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must have no side effects.  The two
 * flag arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004171
Alexander Belopolsky40018472011-02-26 01:02:56 +00004172PyObject *
4173PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004174 Py_ssize_t size,
4175 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004176{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004177 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4178}
4179
Antoine Pitrou244651a2009-05-04 18:56:13 +00004180/* The decoder. The only state we preserve is our read position,
4181 * i.e. how many characters we have consumed. So if we end in the
4182 * middle of a shift sequence we have to back off the read position
4183 * and the output to the beginning of the sequence, otherwise we lose
4184 * all the shift state (seen bits, number of bits seen, high
4185 * surrogate). */
4186
Alexander Belopolsky40018472011-02-26 01:02:56 +00004187PyObject *
4188PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004189 Py_ssize_t size,
4190 const char *errors,
4191 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004194 Py_ssize_t startinpos;
4195 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004196 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004197 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004198 const char *errmsg = "";
4199 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004200 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004201 unsigned int base64bits = 0;
4202 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004203 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 PyObject *errorHandler = NULL;
4205 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004206
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004207 if (size == 0) {
4208 if (consumed)
4209 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004210 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004211 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004212
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213 /* Start off assuming it's all ASCII. Widen later as necessary. */
4214 _PyUnicodeWriter_Init(&writer, 0);
4215 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4216 goto onError;
4217
4218 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004219 e = s + size;
4220
4221 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004222 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004224 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004225
Antoine Pitrou244651a2009-05-04 18:56:13 +00004226 if (inShift) { /* in a base-64 section */
4227 if (IS_BASE64(ch)) { /* consume a base-64 character */
4228 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4229 base64bits += 6;
4230 s++;
4231 if (base64bits >= 16) {
4232 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004233 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004234 base64bits -= 16;
4235 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4236 if (surrogate) {
4237 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004238 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4239 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004241 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4243 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004244 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004245 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246 }
4247 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004248 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004249 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4251 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004253 }
4254 }
Victor Stinner551ac952011-11-29 22:58:13 +01004255 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004256 /* first surrogate */
4257 surrogate = outCh;
4258 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004259 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004260 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004261 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004262 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4263 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004264 }
4265 }
4266 }
4267 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 inShift = 0;
4269 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004270 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004272 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4274 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004275 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004277 if (base64bits > 0) { /* left-over bits */
4278 if (base64bits >= 6) {
4279 /* We've seen at least one base-64 character */
4280 errmsg = "partial character in shift sequence";
4281 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004283 else {
4284 /* Some bits remain; they should be zero */
4285 if (base64buffer != 0) {
4286 errmsg = "non-zero padding bits in shift sequence";
4287 goto utf7Error;
4288 }
4289 }
4290 }
4291 if (ch != '-') {
4292 /* '-' is absorbed; other terminating
4293 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004295 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4297 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004299 }
4300 }
4301 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 s++; /* consume '+' */
4304 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004307 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4309 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 }
4311 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004313 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315 }
4316 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004319 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4320 goto onError;
4321 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4322 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 else {
4325 startinpos = s-starts;
4326 s++;
4327 errmsg = "unexpected special character";
4328 goto utf7Error;
4329 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 errors, &errorHandler,
4335 "utf7", errmsg,
4336 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339 }
4340
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 /* end of string */
4342
4343 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4344 /* if we're in an inconsistent state, that's an error */
4345 if (surrogate ||
4346 (base64bits >= 6) ||
4347 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 errors, &errorHandler,
4351 "utf7", "unterminated shift sequence",
4352 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 goto onError;
4355 if (s < e)
4356 goto restart;
4357 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359
4360 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004361 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004364 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 }
4366 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004367 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004369 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 Py_XDECREF(errorHandler);
4372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004373 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_XDECREF(errorHandler);
4377 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004378 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 return NULL;
4380}
4381
4382
Alexander Belopolsky40018472011-02-26 01:02:56 +00004383PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004384_PyUnicode_EncodeUTF7(PyObject *str,
4385 int base64SetO,
4386 int base64WhiteSpace,
4387 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004389 int kind;
4390 void *data;
4391 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004392 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004394 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 unsigned int base64bits = 0;
4396 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 char * out;
4398 char * start;
4399
Benjamin Petersonbac79492012-01-14 13:34:47 -05004400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004401 return NULL;
4402 kind = PyUnicode_KIND(str);
4403 data = PyUnicode_DATA(str);
4404 len = PyUnicode_GET_LENGTH(str);
4405
4406 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004409 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004410 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004411 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004412 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 if (v == NULL)
4414 return NULL;
4415
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004416 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004417 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004418 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 if (inShift) {
4421 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4422 /* shifting out */
4423 if (base64bits) { /* output remaining bits */
4424 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4425 base64buffer = 0;
4426 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 }
4428 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 /* Characters not in the BASE64 set implicitly unshift the sequence
4430 so no '-' is required, except if the character is itself a '-' */
4431 if (IS_BASE64(ch) || ch == '-') {
4432 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434 *out++ = (char) ch;
4435 }
4436 else {
4437 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004438 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 else { /* not in a shift sequence */
4441 if (ch == '+') {
4442 *out++ = '+';
4443 *out++ = '-';
4444 }
4445 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4446 *out++ = (char) ch;
4447 }
4448 else {
4449 *out++ = '+';
4450 inShift = 1;
4451 goto encode_char;
4452 }
4453 }
4454 continue;
4455encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004457 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004458
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 /* code first surrogate */
4460 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004461 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 while (base64bits >= 6) {
4463 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4464 base64bits -= 6;
4465 }
4466 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004467 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 base64bits += 16;
4470 base64buffer = (base64buffer << 16) | ch;
4471 while (base64bits >= 6) {
4472 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4473 base64bits -= 6;
4474 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004475 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 if (base64bits)
4477 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4478 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004480 if (_PyBytes_Resize(&v, out - start) < 0)
4481 return NULL;
4482 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484PyObject *
4485PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4486 Py_ssize_t size,
4487 int base64SetO,
4488 int base64WhiteSpace,
4489 const char *errors)
4490{
4491 PyObject *result;
4492 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4493 if (tmp == NULL)
4494 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004495 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 base64WhiteSpace, errors);
4497 Py_DECREF(tmp);
4498 return result;
4499}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501#undef IS_BASE64
4502#undef FROM_BASE64
4503#undef TO_BASE64
4504#undef DECODE_DIRECT
4505#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507/* --- UTF-8 Codec -------------------------------------------------------- */
4508
Alexander Belopolsky40018472011-02-26 01:02:56 +00004509PyObject *
4510PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004511 Py_ssize_t size,
4512 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513{
Walter Dörwald69652032004-09-07 20:24:22 +00004514 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4515}
4516
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004517#include "stringlib/asciilib.h"
4518#include "stringlib/codecs.h"
4519#include "stringlib/undef.h"
4520
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004521#include "stringlib/ucs1lib.h"
4522#include "stringlib/codecs.h"
4523#include "stringlib/undef.h"
4524
4525#include "stringlib/ucs2lib.h"
4526#include "stringlib/codecs.h"
4527#include "stringlib/undef.h"
4528
4529#include "stringlib/ucs4lib.h"
4530#include "stringlib/codecs.h"
4531#include "stringlib/undef.h"
4532
Antoine Pitrouab868312009-01-10 15:40:25 +00004533/* Mask to quickly check whether a C 'long' contains a
4534 non-ASCII, UTF8-encoded char. */
4535#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004536# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004537#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004538# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004539#else
4540# error C 'long' size should be either 4 or 8!
4541#endif
4542
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004543static Py_ssize_t
4544ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004545{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004546 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004547 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004548
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004549#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004550 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4551 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004552 /* Fast path, see in STRINGLIB(utf8_decode) for
4553 an explanation. */
4554 /* Help register allocation */
4555 register const char *_p = p;
4556 register Py_UCS1 * q = dest;
4557 while (_p < aligned_end) {
4558 unsigned long value = *(const unsigned long *) _p;
4559 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004561 *((unsigned long *)q) = value;
4562 _p += SIZEOF_LONG;
4563 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004564 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004565 p = _p;
4566 while (p < end) {
4567 if ((unsigned char)*p & 0x80)
4568 break;
4569 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004571 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004573#endif
4574 while (p < end) {
4575 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4576 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004577 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004578 /* Help register allocation */
4579 register const char *_p = p;
4580 while (_p < aligned_end) {
4581 unsigned long value = *(unsigned long *) _p;
4582 if (value & ASCII_CHAR_MASK)
4583 break;
4584 _p += SIZEOF_LONG;
4585 }
4586 p = _p;
4587 if (_p == end)
4588 break;
4589 }
4590 if ((unsigned char)*p & 0x80)
4591 break;
4592 ++p;
4593 }
4594 memcpy(dest, start, p - start);
4595 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596}
Antoine Pitrouab868312009-01-10 15:40:25 +00004597
Victor Stinner785938e2011-12-11 20:09:03 +01004598PyObject *
4599PyUnicode_DecodeUTF8Stateful(const char *s,
4600 Py_ssize_t size,
4601 const char *errors,
4602 Py_ssize_t *consumed)
4603{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004604 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004605 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004606 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004607
4608 Py_ssize_t startinpos;
4609 Py_ssize_t endinpos;
4610 const char *errmsg = "";
4611 PyObject *errorHandler = NULL;
4612 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004613
4614 if (size == 0) {
4615 if (consumed)
4616 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004618 }
4619
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004620 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4621 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004622 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004623 *consumed = 1;
4624 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004625 }
4626
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 _PyUnicodeWriter_Init(&writer, 0);
4628 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4629 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004630
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 writer.pos = ascii_decode(s, end, writer.data);
4632 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004633 while (s < end) {
4634 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004637 if (PyUnicode_IS_ASCII(writer.buffer))
4638 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004639 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004640 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004641 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004643 } else {
4644 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004645 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004646 }
4647
4648 switch (ch) {
4649 case 0:
4650 if (s == end || consumed)
4651 goto End;
4652 errmsg = "unexpected end of data";
4653 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004654 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004655 break;
4656 case 1:
4657 errmsg = "invalid start byte";
4658 startinpos = s - starts;
4659 endinpos = startinpos + 1;
4660 break;
4661 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004662 case 3:
4663 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 errmsg = "invalid continuation byte";
4665 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004666 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 break;
4668 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4672 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 continue;
4674 }
4675
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004676 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 errors, &errorHandler,
4678 "utf-8", errmsg,
4679 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004682 }
4683
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 if (consumed)
4686 *consumed = s - starts;
4687
4688 Py_XDECREF(errorHandler);
4689 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004691
4692onError:
4693 Py_XDECREF(errorHandler);
4694 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004697}
4698
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004699#ifdef __APPLE__
4700
4701/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004702 used to decode the command line arguments on Mac OS X.
4703
4704 Return a pointer to a newly allocated wide character string (use
4705 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004706
4707wchar_t*
4708_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4709{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004710 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 wchar_t *unicode;
4712 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004713
4714 /* Note: size will always be longer than the resulting Unicode
4715 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004716 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004717 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004718 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4719 if (!unicode)
4720 return NULL;
4721
4722 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004723 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004725 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004727#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004729#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004731#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 if (ch > 0xFF) {
4733#if SIZEOF_WCHAR_T == 4
4734 assert(0);
4735#else
4736 assert(Py_UNICODE_IS_SURROGATE(ch));
4737 /* compute and append the two surrogates: */
4738 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4739 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4740#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004741 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 else {
4743 if (!ch && s == e)
4744 break;
4745 /* surrogateescape */
4746 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4747 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004748 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004750 return unicode;
4751}
4752
4753#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004755/* Primary internal function which creates utf8 encoded bytes objects.
4756
4757 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004758 and allocate exactly as much space needed at the end. Else allocate the
4759 maximum possible needed (4 result bytes per Unicode character), and return
4760 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004761*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004762PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004763_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764{
Victor Stinner6099a032011-12-18 14:22:26 +01004765 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 void *data;
4767 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004769 if (!PyUnicode_Check(unicode)) {
4770 PyErr_BadArgument();
4771 return NULL;
4772 }
4773
4774 if (PyUnicode_READY(unicode) == -1)
4775 return NULL;
4776
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004777 if (PyUnicode_UTF8(unicode))
4778 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4779 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780
4781 kind = PyUnicode_KIND(unicode);
4782 data = PyUnicode_DATA(unicode);
4783 size = PyUnicode_GET_LENGTH(unicode);
4784
Benjamin Petersonead6b532011-12-20 17:23:42 -06004785 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004786 default:
4787 assert(0);
4788 case PyUnicode_1BYTE_KIND:
4789 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4790 assert(!PyUnicode_IS_ASCII(unicode));
4791 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4792 case PyUnicode_2BYTE_KIND:
4793 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4794 case PyUnicode_4BYTE_KIND:
4795 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004796 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797}
4798
Alexander Belopolsky40018472011-02-26 01:02:56 +00004799PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4801 Py_ssize_t size,
4802 const char *errors)
4803{
4804 PyObject *v, *unicode;
4805
4806 unicode = PyUnicode_FromUnicode(s, size);
4807 if (unicode == NULL)
4808 return NULL;
4809 v = _PyUnicode_AsUTF8String(unicode, errors);
4810 Py_DECREF(unicode);
4811 return v;
4812}
4813
4814PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004815PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818}
4819
Walter Dörwald41980ca2007-08-16 21:55:45 +00004820/* --- UTF-32 Codec ------------------------------------------------------- */
4821
4822PyObject *
4823PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 Py_ssize_t size,
4825 const char *errors,
4826 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004827{
4828 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4829}
4830
4831PyObject *
4832PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 Py_ssize_t size,
4834 const char *errors,
4835 int *byteorder,
4836 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004837{
4838 const char *starts = s;
4839 Py_ssize_t startinpos;
4840 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004841 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004842 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004843 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004844 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004845 PyObject *errorHandler = NULL;
4846 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004847
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848 q = (unsigned char *)s;
4849 e = q + size;
4850
4851 if (byteorder)
4852 bo = *byteorder;
4853
4854 /* Check for BOM marks (U+FEFF) in the input and adjust current
4855 byte order setting accordingly. In native mode, the leading BOM
4856 mark is skipped, in all other modes, it is copied to the output
4857 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004858 if (bo == 0 && size >= 4) {
4859 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4860 if (bom == 0x0000FEFF) {
4861 bo = -1;
4862 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004864 else if (bom == 0xFFFE0000) {
4865 bo = 1;
4866 q += 4;
4867 }
4868 if (byteorder)
4869 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870 }
4871
Victor Stinnere64322e2012-10-30 23:12:47 +01004872 if (q == e) {
4873 if (consumed)
4874 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004875 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004876 }
4877
Victor Stinnere64322e2012-10-30 23:12:47 +01004878#ifdef WORDS_BIGENDIAN
4879 le = bo < 0;
4880#else
4881 le = bo <= 0;
4882#endif
4883
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004884 _PyUnicodeWriter_Init(&writer, 0);
4885 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4886 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004887
Victor Stinnere64322e2012-10-30 23:12:47 +01004888 while (1) {
4889 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004891
Victor Stinnere64322e2012-10-30 23:12:47 +01004892 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004893 enum PyUnicode_Kind kind = writer.kind;
4894 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004895 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004896 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004897 if (le) {
4898 do {
4899 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4900 if (ch > maxch)
4901 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004902 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004903 q += 4;
4904 } while (q <= last);
4905 }
4906 else {
4907 do {
4908 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4909 if (ch > maxch)
4910 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004911 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004912 q += 4;
4913 } while (q <= last);
4914 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 }
4917
4918 if (ch <= maxch) {
4919 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004921 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004923 startinpos = ((const char *)q) - starts;
4924 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 else {
4927 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004929 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004930 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4931 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004932 q += 4;
4933 continue;
4934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004936 startinpos = ((const char *)q) - starts;
4937 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004939
4940 /* The remaining input chars are ignored if the callback
4941 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004942 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 errors, &errorHandler,
4944 "utf32", errmsg,
4945 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 }
4949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953 Py_XDECREF(errorHandler);
4954 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004955 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004958 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 Py_XDECREF(errorHandler);
4960 Py_XDECREF(exc);
4961 return NULL;
4962}
4963
4964PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004965_PyUnicode_EncodeUTF32(PyObject *str,
4966 const char *errors,
4967 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004969 int kind;
4970 void *data;
4971 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004972 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004974 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004976#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 int iorder[] = {0, 1, 2, 3};
4978#else
4979 int iorder[] = {3, 2, 1, 0};
4980#endif
4981
Benjamin Peterson29060642009-01-31 22:14:21 +00004982#define STORECHAR(CH) \
4983 do { \
4984 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4985 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4986 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4987 p[iorder[0]] = (CH) & 0xff; \
4988 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989 } while(0)
4990
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004991 if (!PyUnicode_Check(str)) {
4992 PyErr_BadArgument();
4993 return NULL;
4994 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004995 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004996 return NULL;
4997 kind = PyUnicode_KIND(str);
4998 data = PyUnicode_DATA(str);
4999 len = PyUnicode_GET_LENGTH(str);
5000
5001 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005002 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005004 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 if (v == NULL)
5006 return NULL;
5007
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005008 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005011 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005012 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013
5014 if (byteorder == -1) {
5015 /* force LE */
5016 iorder[0] = 0;
5017 iorder[1] = 1;
5018 iorder[2] = 2;
5019 iorder[3] = 3;
5020 }
5021 else if (byteorder == 1) {
5022 /* force BE */
5023 iorder[0] = 3;
5024 iorder[1] = 2;
5025 iorder[2] = 1;
5026 iorder[3] = 0;
5027 }
5028
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005029 for (i = 0; i < len; i++)
5030 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005031
5032 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005033 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034#undef STORECHAR
5035}
5036
Alexander Belopolsky40018472011-02-26 01:02:56 +00005037PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005038PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5039 Py_ssize_t size,
5040 const char *errors,
5041 int byteorder)
5042{
5043 PyObject *result;
5044 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5045 if (tmp == NULL)
5046 return NULL;
5047 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5048 Py_DECREF(tmp);
5049 return result;
5050}
5051
5052PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005053PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054{
Victor Stinnerb960b342011-11-20 19:12:52 +01005055 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056}
5057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058/* --- UTF-16 Codec ------------------------------------------------------- */
5059
Tim Peters772747b2001-08-09 22:21:55 +00005060PyObject *
5061PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 Py_ssize_t size,
5063 const char *errors,
5064 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065{
Walter Dörwald69652032004-09-07 20:24:22 +00005066 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5067}
5068
5069PyObject *
5070PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 Py_ssize_t size,
5072 const char *errors,
5073 int *byteorder,
5074 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005075{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t startinpos;
5078 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005079 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005080 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005081 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005082 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005083 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 PyObject *errorHandler = NULL;
5085 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Tim Peters772747b2001-08-09 22:21:55 +00005087 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005088 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
5090 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005091 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005093 /* Check for BOM marks (U+FEFF) in the input and adjust current
5094 byte order setting accordingly. In native mode, the leading BOM
5095 mark is skipped, in all other modes, it is copied to the output
5096 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005097 if (bo == 0 && size >= 2) {
5098 const Py_UCS4 bom = (q[1] << 8) | q[0];
5099 if (bom == 0xFEFF) {
5100 q += 2;
5101 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005103 else if (bom == 0xFFFE) {
5104 q += 2;
5105 bo = 1;
5106 }
5107 if (byteorder)
5108 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110
Antoine Pitrou63065d72012-05-15 23:48:04 +02005111 if (q == e) {
5112 if (consumed)
5113 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005114 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005115 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005116
Christian Heimes743e0cd2012-10-17 23:52:17 +02005117#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005118 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005119#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005120 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005121#endif
Tim Peters772747b2001-08-09 22:21:55 +00005122
Antoine Pitrou63065d72012-05-15 23:48:04 +02005123 /* Note: size will always be longer than the resulting Unicode
5124 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005125 _PyUnicodeWriter_Init(&writer, 0);
5126 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5127 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005128
Antoine Pitrou63065d72012-05-15 23:48:04 +02005129 while (1) {
5130 Py_UCS4 ch = 0;
5131 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005132 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005133 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005134 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005137 native_ordering);
5138 else
5139 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005140 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005141 native_ordering);
5142 } else if (kind == PyUnicode_2BYTE_KIND) {
5143 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005144 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005145 native_ordering);
5146 } else {
5147 assert(kind == PyUnicode_4BYTE_KIND);
5148 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005149 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005150 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005151 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153
Antoine Pitrou63065d72012-05-15 23:48:04 +02005154 switch (ch)
5155 {
5156 case 0:
5157 /* remaining byte at the end? (size should be even) */
5158 if (q == e || consumed)
5159 goto End;
5160 errmsg = "truncated data";
5161 startinpos = ((const char *)q) - starts;
5162 endinpos = ((const char *)e) - starts;
5163 break;
5164 /* The remaining input chars are ignored if the callback
5165 chooses to skip the input */
5166 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005167 q -= 2;
5168 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005169 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005170 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005171 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005172 endinpos = ((const char *)e) - starts;
5173 break;
5174 case 2:
5175 errmsg = "illegal encoding";
5176 startinpos = ((const char *)q) - 2 - starts;
5177 endinpos = startinpos + 2;
5178 break;
5179 case 3:
5180 errmsg = "illegal UTF-16 surrogate";
5181 startinpos = ((const char *)q) - 4 - starts;
5182 endinpos = startinpos + 2;
5183 break;
5184 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005186 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005187 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5188 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 continue;
5190 }
5191
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005192 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005193 errors,
5194 &errorHandler,
5195 "utf16", errmsg,
5196 &starts,
5197 (const char **)&e,
5198 &startinpos,
5199 &endinpos,
5200 &exc,
5201 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005202 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 }
5205
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206End:
Walter Dörwald69652032004-09-07 20:24:22 +00005207 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 Py_XDECREF(errorHandler);
5211 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005212 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005215 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 Py_XDECREF(errorHandler);
5217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 return NULL;
5219}
5220
Tim Peters772747b2001-08-09 22:21:55 +00005221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005222_PyUnicode_EncodeUTF16(PyObject *str,
5223 const char *errors,
5224 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005226 enum PyUnicode_Kind kind;
5227 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005228 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005230 unsigned short *out;
5231 Py_ssize_t bytesize;
5232 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005233#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005234 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005235#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005236 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005237#endif
5238
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005239 if (!PyUnicode_Check(str)) {
5240 PyErr_BadArgument();
5241 return NULL;
5242 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005243 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005244 return NULL;
5245 kind = PyUnicode_KIND(str);
5246 data = PyUnicode_DATA(str);
5247 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005248
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005249 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005250 if (kind == PyUnicode_4BYTE_KIND) {
5251 const Py_UCS4 *in = (const Py_UCS4 *)data;
5252 const Py_UCS4 *end = in + len;
5253 while (in < end)
5254 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005255 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005256 }
5257 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005260 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 if (v == NULL)
5262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005264 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005265 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005266 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005270 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005271
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005272 switch (kind) {
5273 case PyUnicode_1BYTE_KIND: {
5274 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5275 break;
Tim Peters772747b2001-08-09 22:21:55 +00005276 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005277 case PyUnicode_2BYTE_KIND: {
5278 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5279 break;
Tim Peters772747b2001-08-09 22:21:55 +00005280 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005281 case PyUnicode_4BYTE_KIND: {
5282 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5283 break;
5284 }
5285 default:
5286 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005287 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005288
5289 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005290 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291}
5292
Alexander Belopolsky40018472011-02-26 01:02:56 +00005293PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005294PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5295 Py_ssize_t size,
5296 const char *errors,
5297 int byteorder)
5298{
5299 PyObject *result;
5300 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5301 if (tmp == NULL)
5302 return NULL;
5303 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5304 Py_DECREF(tmp);
5305 return result;
5306}
5307
5308PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005309PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005311 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312}
5313
5314/* --- Unicode Escape Codec ----------------------------------------------- */
5315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005316/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5317 if all the escapes in the string make it still a valid ASCII string.
5318 Returns -1 if any escapes were found which cause the string to
5319 pop out of ASCII range. Otherwise returns the length of the
5320 required buffer to hold the string.
5321 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005322static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005323length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5324{
5325 const unsigned char *p = (const unsigned char *)s;
5326 const unsigned char *end = p + size;
5327 Py_ssize_t length = 0;
5328
5329 if (size < 0)
5330 return -1;
5331
5332 for (; p < end; ++p) {
5333 if (*p > 127) {
5334 /* Non-ASCII */
5335 return -1;
5336 }
5337 else if (*p != '\\') {
5338 /* Normal character */
5339 ++length;
5340 }
5341 else {
5342 /* Backslash-escape, check next char */
5343 ++p;
5344 /* Escape sequence reaches till end of string or
5345 non-ASCII follow-up. */
5346 if (p >= end || *p > 127)
5347 return -1;
5348 switch (*p) {
5349 case '\n':
5350 /* backslash + \n result in zero characters */
5351 break;
5352 case '\\': case '\'': case '\"':
5353 case 'b': case 'f': case 't':
5354 case 'n': case 'r': case 'v': case 'a':
5355 ++length;
5356 break;
5357 case '0': case '1': case '2': case '3':
5358 case '4': case '5': case '6': case '7':
5359 case 'x': case 'u': case 'U': case 'N':
5360 /* these do not guarantee ASCII characters */
5361 return -1;
5362 default:
5363 /* count the backslash + the other character */
5364 length += 2;
5365 }
5366 }
5367 }
5368 return length;
5369}
5370
Fredrik Lundh06d12682001-01-24 07:59:11 +00005371static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005372
Alexander Belopolsky40018472011-02-26 01:02:56 +00005373PyObject *
5374PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005375 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005376 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 Py_ssize_t startinpos;
5380 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005381 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005383 char* message;
5384 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 PyObject *errorHandler = NULL;
5386 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005387 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005388
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005390 if (len == 0)
5391 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005392
5393 /* After length_of_escaped_ascii_string() there are two alternatives,
5394 either the string is pure ASCII with named escapes like \n, etc.
5395 and we determined it's exact size (common case)
5396 or it contains \x, \u, ... escape sequences. then we create a
5397 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005398 _PyUnicodeWriter_Init(&writer, 0);
5399 if (len > 0) {
5400 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005402 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403 }
5404 else {
5405 /* Escaped strings will always be longer than the resulting
5406 Unicode string, so we start with size here and then reduce the
5407 length after conversion to the true value.
5408 (but if the error callback returns a long replacement string
5409 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005410 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412 }
5413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005415 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 while (s < end) {
5419 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005420 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
5423 /* Non-escape characters are interpreted as Unicode ordinals */
5424 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005425 x = (unsigned char)*s;
5426 s++;
5427 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005428 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005429 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5430 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 continue;
5432 }
5433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 /* \ - Escapes */
5436 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005437 c = *s++;
5438 if (s > end)
5439 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005441 /* The only case in which i == ascii_length is a backslash
5442 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005445 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005448#define WRITECHAR(ch) \
5449 do { \
5450 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5451 goto onError; \
5452 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5453 writer.pos++; \
5454 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005457 case '\\': WRITECHAR('\\'); break;
5458 case '\'': WRITECHAR('\''); break;
5459 case '\"': WRITECHAR('\"'); break;
5460 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005462 case 'f': WRITECHAR('\014'); break;
5463 case 't': WRITECHAR('\t'); break;
5464 case 'n': WRITECHAR('\n'); break;
5465 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005467 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005469 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 case '0': case '1': case '2': case '3':
5473 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005474 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005475 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005476 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005477 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005478 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 break;
5482
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 /* hex escapes */
5484 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005486 digits = 2;
5487 message = "truncated \\xXX escape";
5488 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 digits = 4;
5493 message = "truncated \\uXXXX escape";
5494 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005497 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005498 digits = 8;
5499 message = "truncated \\UXXXXXXXX escape";
5500 hexescape:
5501 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005502 if (end - s < digits) {
5503 /* count only hex digits */
5504 for (; s < end; ++s) {
5505 c = (unsigned char)*s;
5506 if (!Py_ISXDIGIT(c))
5507 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005508 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005509 goto error;
5510 }
5511 for (; digits--; ++s) {
5512 c = (unsigned char)*s;
5513 if (!Py_ISXDIGIT(c))
5514 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 chr = (chr<<4) & ~0xF;
5516 if (c >= '0' && c <= '9')
5517 chr += c - '0';
5518 else if (c >= 'a' && c <= 'f')
5519 chr += 10 + c - 'a';
5520 else
5521 chr += 10 + c - 'A';
5522 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005523 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 /* _decoding_error will have already written into the
5525 target buffer. */
5526 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005527 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005528 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005529 message = "illegal Unicode character";
5530 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005531 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005532 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005533 break;
5534
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 case 'N':
5537 message = "malformed \\N character escape";
5538 if (ucnhash_CAPI == NULL) {
5539 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5541 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005542 if (ucnhash_CAPI == NULL)
5543 goto ucnhashError;
5544 }
5545 if (*s == '{') {
5546 const char *start = s+1;
5547 /* look for the closing brace */
5548 while (*s != '}' && s < end)
5549 s++;
5550 if (s > start && s < end && *s == '}') {
5551 /* found a name. look it up in the unicode database */
5552 message = "unknown Unicode character name";
5553 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005554 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005555 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005556 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005557 goto store;
5558 }
5559 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005560 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005561
5562 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005563 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 message = "\\ at end of string";
5565 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005566 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005567 }
5568 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005569 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005570 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005571 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005572 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005574 continue;
5575
5576 error:
5577 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005578 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005579 errors, &errorHandler,
5580 "unicodeescape", message,
5581 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005582 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005583 goto onError;
5584 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005586#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005590 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005591
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005593 PyErr_SetString(
5594 PyExc_UnicodeError,
5595 "\\N escapes not supported (can't load unicodedata module)"
5596 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005597 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 Py_XDECREF(errorHandler);
5599 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005600 return NULL;
5601
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005603 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604 Py_XDECREF(errorHandler);
5605 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 return NULL;
5607}
5608
5609/* Return a Unicode-Escape string version of the Unicode object.
5610
5611 If quotes is true, the string is enclosed in u"" or u'' quotes as
5612 appropriate.
5613
5614*/
5615
Alexander Belopolsky40018472011-02-26 01:02:56 +00005616PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005617PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005620 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005622 int kind;
5623 void *data;
5624 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625
Ezio Melottie7f90372012-10-05 03:33:31 +03005626 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005627 escape.
5628
Ezio Melottie7f90372012-10-05 03:33:31 +03005629 For UCS1 strings it's '\xxx', 4 bytes per source character.
5630 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5631 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005632 */
5633
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 if (!PyUnicode_Check(unicode)) {
5635 PyErr_BadArgument();
5636 return NULL;
5637 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005638 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005639 return NULL;
5640 len = PyUnicode_GET_LENGTH(unicode);
5641 kind = PyUnicode_KIND(unicode);
5642 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005643 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005644 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5645 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5646 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5647 }
5648
5649 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005650 return PyBytes_FromStringAndSize(NULL, 0);
5651
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005652 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005654
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005655 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 if (repr == NULL)
5660 return NULL;
5661
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005662 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005664 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005665 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005666
Walter Dörwald79e913e2007-05-12 11:08:06 +00005667 /* Escape backslashes */
5668 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 *p++ = '\\';
5670 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005671 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005672 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005673
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005674 /* Map 21-bit characters to '\U00xxxxxx' */
5675 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005676 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005677 *p++ = '\\';
5678 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005679 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5680 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5681 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5682 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5683 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5684 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5685 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5686 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005688 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005691 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 *p++ = '\\';
5693 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005694 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5695 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5696 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5697 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005699
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005700 /* Map special whitespace to '\t', \n', '\r' */
5701 else if (ch == '\t') {
5702 *p++ = '\\';
5703 *p++ = 't';
5704 }
5705 else if (ch == '\n') {
5706 *p++ = '\\';
5707 *p++ = 'n';
5708 }
5709 else if (ch == '\r') {
5710 *p++ = '\\';
5711 *p++ = 'r';
5712 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005713
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005714 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005715 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005717 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005718 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5719 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005720 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005721
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 /* Copy everything else as-is */
5723 else
5724 *p++ = (char) ch;
5725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005727 assert(p - PyBytes_AS_STRING(repr) > 0);
5728 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5729 return NULL;
5730 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731}
5732
Alexander Belopolsky40018472011-02-26 01:02:56 +00005733PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5735 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 PyObject *result;
5738 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5739 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 result = PyUnicode_AsUnicodeEscapeString(tmp);
5742 Py_DECREF(tmp);
5743 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744}
5745
5746/* --- Raw Unicode Escape Codec ------------------------------------------- */
5747
Alexander Belopolsky40018472011-02-26 01:02:56 +00005748PyObject *
5749PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005750 Py_ssize_t size,
5751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t startinpos;
5755 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005756 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 const char *end;
5758 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 PyObject *errorHandler = NULL;
5760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005762 if (size == 0)
5763 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005764
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 /* Escaped strings will always be longer than the resulting
5766 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 length after conversion to the true value. (But decoding error
5768 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005769 _PyUnicodeWriter_Init(&writer, 1);
5770 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 end = s + size;
5774 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 unsigned char c;
5776 Py_UCS4 x;
5777 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005778 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* Non-escape characters are interpreted as Unicode ordinals */
5781 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005782 x = (unsigned char)*s++;
5783 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005784 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5786 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 startinpos = s-starts;
5790
5791 /* \u-escapes are only interpreted iff the number of leading
5792 backslashes if odd */
5793 bs = s;
5794 for (;s < end;) {
5795 if (*s != '\\')
5796 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005797 x = (unsigned char)*s++;
5798 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005799 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005800 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5801 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 }
5803 if (((s - bs) & 1) == 0 ||
5804 s >= end ||
5805 (*s != 'u' && *s != 'U')) {
5806 continue;
5807 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 count = *s=='u' ? 4 : 8;
5810 s++;
5811
5812 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 for (x = 0, i = 0; i < count; ++i, ++s) {
5814 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005815 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005817 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 errors, &errorHandler,
5819 "rawunicodeescape", "truncated \\uXXXX",
5820 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005821 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 goto onError;
5823 goto nextByte;
5824 }
5825 x = (x<<4) & ~0xF;
5826 if (c >= '0' && c <= '9')
5827 x += c - '0';
5828 else if (c >= 'a' && c <= 'f')
5829 x += 10 + c - 'a';
5830 else
5831 x += 10 + c - 'A';
5832 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005833 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005836 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5837 writer.pos++;
5838 }
5839 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005840 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005841 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005842 errors, &errorHandler,
5843 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 nextByte:
5849 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005856 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return NULL;
5860}
5861
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005862
Alexander Belopolsky40018472011-02-26 01:02:56 +00005863PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005864PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005866 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 char *p;
5868 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869 Py_ssize_t expandsize, pos;
5870 int kind;
5871 void *data;
5872 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 if (!PyUnicode_Check(unicode)) {
5875 PyErr_BadArgument();
5876 return NULL;
5877 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005878 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 return NULL;
5880 kind = PyUnicode_KIND(unicode);
5881 data = PyUnicode_DATA(unicode);
5882 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005883 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5884 bytes, and 1 byte characters 4. */
5885 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005886
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005889
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 if (repr == NULL)
5892 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005894 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005896 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 for (pos = 0; pos < len; pos++) {
5898 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 /* Map 32-bit characters to '\Uxxxxxxxx' */
5900 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005901 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005902 *p++ = '\\';
5903 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005904 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5905 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5906 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5907 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5908 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5909 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5910 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5911 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005912 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 *p++ = '\\';
5916 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005917 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5918 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5919 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5920 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* Copy everything else as-is */
5923 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 *p++ = (char) ch;
5925 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005926
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005927 assert(p > q);
5928 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 return NULL;
5930 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931}
5932
Alexander Belopolsky40018472011-02-26 01:02:56 +00005933PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005934PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5935 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 PyObject *result;
5938 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5939 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005940 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5942 Py_DECREF(tmp);
5943 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944}
5945
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005946/* --- Unicode Internal Codec ------------------------------------------- */
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
5949_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005950 Py_ssize_t size,
5951 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005952{
5953 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t startinpos;
5955 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005956 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005957 const char *end;
5958 const char *reason;
5959 PyObject *errorHandler = NULL;
5960 PyObject *exc = NULL;
5961
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005962 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005963 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005964 1))
5965 return NULL;
5966
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005967 if (size == 0)
5968 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005969
Thomas Wouters89f507f2006-12-13 04:49:30 +00005970 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971 _PyUnicodeWriter_Init(&writer, 0);
5972 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005974 end = s + size;
5975
5976 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005977 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005978 Py_UCS4 ch;
5979 /* We copy the raw representation one byte at a time because the
5980 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005981 ((char *) &uch)[0] = s[0];
5982 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005983#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005984 ((char *) &uch)[2] = s[2];
5985 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005986#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005987 ch = uch;
5988
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 /* We have to sanity check the raw data, otherwise doom looms for
5990 some malformed UCS-4 data. */
5991 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005992#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005993 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005994#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005995 end-s < Py_UNICODE_SIZE
5996 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005998 startinpos = s - starts;
5999 if (end-s < Py_UNICODE_SIZE) {
6000 endinpos = end-starts;
6001 reason = "truncated input";
6002 }
6003 else {
6004 endinpos = s - starts + Py_UNICODE_SIZE;
6005 reason = "illegal code point (> 0x10FFFF)";
6006 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006007 if (unicode_decode_call_errorhandler_writer(
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006008 errors, &errorHandler,
6009 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006010 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006011 &writer))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006013 continue;
6014 }
6015
6016 s += Py_UNICODE_SIZE;
6017#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006018 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006019 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006020 Py_UNICODE uch2;
6021 ((char *) &uch2)[0] = s[0];
6022 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006023 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006024 {
Victor Stinner551ac952011-11-29 22:58:13 +01006025 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006026 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006027 }
6028 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006029#endif
6030
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006031 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006032 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006033 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6034 writer.pos++;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006035 }
6036
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037 Py_XDECREF(errorHandler);
6038 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006039 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006040
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006042 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006043 Py_XDECREF(errorHandler);
6044 Py_XDECREF(exc);
6045 return NULL;
6046}
6047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048/* --- Latin-1 Codec ------------------------------------------------------ */
6049
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050PyObject *
6051PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006052 Py_ssize_t size,
6053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006056 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006060static void
6061make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006062 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006063 PyObject *unicode,
6064 Py_ssize_t startpos, Py_ssize_t endpos,
6065 const char *reason)
6066{
6067 if (*exceptionObject == NULL) {
6068 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006069 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006070 encoding, unicode, startpos, endpos, reason);
6071 }
6072 else {
6073 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6074 goto onError;
6075 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6076 goto onError;
6077 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6078 goto onError;
6079 return;
6080 onError:
6081 Py_DECREF(*exceptionObject);
6082 *exceptionObject = NULL;
6083 }
6084}
6085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087static void
6088raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006089 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006090 PyObject *unicode,
6091 Py_ssize_t startpos, Py_ssize_t endpos,
6092 const char *reason)
6093{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006094 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006095 encoding, unicode, startpos, endpos, reason);
6096 if (*exceptionObject != NULL)
6097 PyCodec_StrictErrors(*exceptionObject);
6098}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099
6100/* error handling callback helper:
6101 build arguments, call the callback and check the arguments,
6102 put the result into newpos and return the replacement string, which
6103 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006104static PyObject *
6105unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006106 PyObject **errorHandler,
6107 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006108 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006109 Py_ssize_t startpos, Py_ssize_t endpos,
6110 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006112 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006113 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 PyObject *restuple;
6115 PyObject *resunicode;
6116
6117 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 }
6122
Benjamin Petersonbac79492012-01-14 13:34:47 -05006123 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006124 return NULL;
6125 len = PyUnicode_GET_LENGTH(unicode);
6126
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006127 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006128 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131
6132 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006137 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 Py_DECREF(restuple);
6139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006141 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 &resunicode, newpos)) {
6143 Py_DECREF(restuple);
6144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6147 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6148 Py_DECREF(restuple);
6149 return NULL;
6150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006152 *newpos = len + *newpos;
6153 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6155 Py_DECREF(restuple);
6156 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_INCREF(resunicode);
6159 Py_DECREF(restuple);
6160 return resunicode;
6161}
6162
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006164unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006165 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006166 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006168 /* input state */
6169 Py_ssize_t pos=0, size;
6170 int kind;
6171 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 /* output object */
6173 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 /* pointer into the output */
6175 char *str;
6176 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006178 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6179 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 PyObject *errorHandler = NULL;
6181 PyObject *exc = NULL;
6182 /* the following variable is used for caching string comparisons
6183 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6184 int known_errorHandler = -1;
6185
Benjamin Petersonbac79492012-01-14 13:34:47 -05006186 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006187 return NULL;
6188 size = PyUnicode_GET_LENGTH(unicode);
6189 kind = PyUnicode_KIND(unicode);
6190 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 /* allocate enough for a simple encoding without
6192 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006193 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006194 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006197 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 ressize = size;
6200
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006201 while (pos < size) {
6202 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* can we encode this? */
6205 if (c<limit) {
6206 /* no overflow check, because we know that the space is enough */
6207 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006208 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 Py_ssize_t requiredsize;
6212 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006213 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006215 Py_ssize_t collstart = pos;
6216 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006218 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 ++collend;
6220 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6221 if (known_errorHandler==-1) {
6222 if ((errors==NULL) || (!strcmp(errors, "strict")))
6223 known_errorHandler = 1;
6224 else if (!strcmp(errors, "replace"))
6225 known_errorHandler = 2;
6226 else if (!strcmp(errors, "ignore"))
6227 known_errorHandler = 3;
6228 else if (!strcmp(errors, "xmlcharrefreplace"))
6229 known_errorHandler = 4;
6230 else
6231 known_errorHandler = 0;
6232 }
6233 switch (known_errorHandler) {
6234 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006235 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 goto onError;
6237 case 2: /* replace */
6238 while (collstart++<collend)
6239 *str++ = '?'; /* fall through */
6240 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006241 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 break;
6243 case 4: /* xmlcharrefreplace */
6244 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006245 /* determine replacement size */
6246 for (i = collstart, repsize = 0; i < collend; ++i) {
6247 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6248 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006256 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006258 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006260 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006261 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006263 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 if (requiredsize > ressize) {
6267 if (requiredsize<2*ressize)
6268 requiredsize = 2*ressize;
6269 if (_PyBytes_Resize(&res, requiredsize))
6270 goto onError;
6271 str = PyBytes_AS_STRING(res) + respos;
6272 ressize = requiredsize;
6273 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 /* generate replacement */
6275 for (i = collstart; i < collend; ++i) {
6276 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 break;
6280 default:
6281 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006282 encoding, reason, unicode, &exc,
6283 collstart, collend, &newpos);
6284 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006285 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006287 if (PyBytes_Check(repunicode)) {
6288 /* Directly copy bytes result to output. */
6289 repsize = PyBytes_Size(repunicode);
6290 if (repsize > 1) {
6291 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006292 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006293 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6294 Py_DECREF(repunicode);
6295 goto onError;
6296 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006297 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006298 ressize += repsize-1;
6299 }
6300 memcpy(str, PyBytes_AsString(repunicode), repsize);
6301 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006303 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006304 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006305 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 /* need more space? (at least enough for what we
6307 have+the replacement+the rest of the string, so
6308 we won't have to check space for encodable characters) */
6309 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006310 repsize = PyUnicode_GET_LENGTH(repunicode);
6311 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 if (requiredsize > ressize) {
6313 if (requiredsize<2*ressize)
6314 requiredsize = 2*ressize;
6315 if (_PyBytes_Resize(&res, requiredsize)) {
6316 Py_DECREF(repunicode);
6317 goto onError;
6318 }
6319 str = PyBytes_AS_STRING(res) + respos;
6320 ressize = requiredsize;
6321 }
6322 /* check if there is anything unencodable in the replacement
6323 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006324 for (i = 0; repsize-->0; ++i, ++str) {
6325 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006327 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 Py_DECREF(repunicode);
6330 goto onError;
6331 }
6332 *str = (char)c;
6333 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006335 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006337 }
6338 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006339 /* Resize if we allocated to much */
6340 size = str - PyBytes_AS_STRING(res);
6341 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006342 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006343 if (_PyBytes_Resize(&res, size) < 0)
6344 goto onError;
6345 }
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006349 return res;
6350
6351 onError:
6352 Py_XDECREF(res);
6353 Py_XDECREF(errorHandler);
6354 Py_XDECREF(exc);
6355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356}
6357
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006359PyObject *
6360PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006361 Py_ssize_t size,
6362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 PyObject *result;
6365 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6366 if (unicode == NULL)
6367 return NULL;
6368 result = unicode_encode_ucs1(unicode, errors, 256);
6369 Py_DECREF(unicode);
6370 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371}
6372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006374_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 PyErr_BadArgument();
6378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006380 if (PyUnicode_READY(unicode) == -1)
6381 return NULL;
6382 /* Fast path: if it is a one-byte string, construct
6383 bytes object directly. */
6384 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6385 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6386 PyUnicode_GET_LENGTH(unicode));
6387 /* Non-Latin-1 characters present. Defer to above function to
6388 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006390}
6391
6392PyObject*
6393PyUnicode_AsLatin1String(PyObject *unicode)
6394{
6395 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
6398/* --- 7-bit ASCII Codec -------------------------------------------------- */
6399
Alexander Belopolsky40018472011-02-26 01:02:56 +00006400PyObject *
6401PyUnicode_DecodeASCII(const char *s,
6402 Py_ssize_t size,
6403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006406 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006407 int kind;
6408 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006409 Py_ssize_t startinpos;
6410 Py_ssize_t endinpos;
6411 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 const char *e;
6413 PyObject *errorHandler = NULL;
6414 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006417 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006420 if (size == 1 && (unsigned char)s[0] < 128)
6421 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006422
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006423 _PyUnicodeWriter_Init(&writer, 0);
6424 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006428 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006429 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006430 writer.pos = outpos;
6431 if (writer.pos == size)
6432 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006433
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006434 s += writer.pos;
6435 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 register unsigned char c = (unsigned char)*s;
6438 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006439 PyUnicode_WRITE(kind, data, writer.pos, c);
6440 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 ++s;
6442 }
6443 else {
6444 startinpos = s-starts;
6445 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006446 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 errors, &errorHandler,
6448 "ascii", "ordinal not in range(128)",
6449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452 kind = writer.kind;
6453 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 Py_XDECREF(errorHandler);
6457 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006458 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 Py_XDECREF(errorHandler);
6463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return NULL;
6465}
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468PyObject *
6469PyUnicode_EncodeASCII(const Py_UNICODE *p,
6470 Py_ssize_t size,
6471 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 PyObject *result;
6474 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6475 if (unicode == NULL)
6476 return NULL;
6477 result = unicode_encode_ucs1(unicode, errors, 128);
6478 Py_DECREF(unicode);
6479 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006483_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
6485 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyErr_BadArgument();
6487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006489 if (PyUnicode_READY(unicode) == -1)
6490 return NULL;
6491 /* Fast path: if it is an ASCII-only string, construct bytes object
6492 directly. Else defer to above function to raise the exception. */
6493 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6494 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6495 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497}
6498
6499PyObject *
6500PyUnicode_AsASCIIString(PyObject *unicode)
6501{
6502 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Victor Stinner99b95382011-07-04 14:23:54 +02006505#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006506
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006507/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006508
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006509#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006510#define NEED_RETRY
6511#endif
6512
Victor Stinner3a50e702011-10-18 21:21:00 +02006513#ifndef WC_ERR_INVALID_CHARS
6514# define WC_ERR_INVALID_CHARS 0x0080
6515#endif
6516
6517static char*
6518code_page_name(UINT code_page, PyObject **obj)
6519{
6520 *obj = NULL;
6521 if (code_page == CP_ACP)
6522 return "mbcs";
6523 if (code_page == CP_UTF7)
6524 return "CP_UTF7";
6525 if (code_page == CP_UTF8)
6526 return "CP_UTF8";
6527
6528 *obj = PyBytes_FromFormat("cp%u", code_page);
6529 if (*obj == NULL)
6530 return NULL;
6531 return PyBytes_AS_STRING(*obj);
6532}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533
Alexander Belopolsky40018472011-02-26 01:02:56 +00006534static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006535is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006536{
6537 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006538 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006539
Victor Stinner3a50e702011-10-18 21:21:00 +02006540 if (!IsDBCSLeadByteEx(code_page, *curr))
6541 return 0;
6542
6543 prev = CharPrevExA(code_page, s, curr, 0);
6544 if (prev == curr)
6545 return 1;
6546 /* FIXME: This code is limited to "true" double-byte encodings,
6547 as it assumes an incomplete character consists of a single
6548 byte. */
6549 if (curr - prev == 2)
6550 return 1;
6551 if (!IsDBCSLeadByteEx(code_page, *prev))
6552 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553 return 0;
6554}
6555
Victor Stinner3a50e702011-10-18 21:21:00 +02006556static DWORD
6557decode_code_page_flags(UINT code_page)
6558{
6559 if (code_page == CP_UTF7) {
6560 /* The CP_UTF7 decoder only supports flags=0 */
6561 return 0;
6562 }
6563 else
6564 return MB_ERR_INVALID_CHARS;
6565}
6566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006568 * Decode a byte string from a Windows code page into unicode object in strict
6569 * mode.
6570 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006571 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6572 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006574static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006575decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006576 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 const char *in,
6578 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579{
Victor Stinner3a50e702011-10-18 21:21:00 +02006580 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006581 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006582 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583
6584 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006585 assert(insize > 0);
6586 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6587 if (outsize <= 0)
6588 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589
6590 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006592 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006593 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 if (*v == NULL)
6595 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006596 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597 }
6598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006600 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006601 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604 }
6605
6606 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6608 if (outsize <= 0)
6609 goto error;
6610 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006611
Victor Stinner3a50e702011-10-18 21:21:00 +02006612error:
6613 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6614 return -2;
6615 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006616 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617}
6618
Victor Stinner3a50e702011-10-18 21:21:00 +02006619/*
6620 * Decode a byte string from a code page into unicode object with an error
6621 * handler.
6622 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006623 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006624 * UnicodeDecodeError exception and returns -1 on error.
6625 */
6626static int
6627decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006628 PyObject **v,
6629 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 const char *errors)
6631{
6632 const char *startin = in;
6633 const char *endin = in + size;
6634 const DWORD flags = decode_code_page_flags(code_page);
6635 /* Ideally, we should get reason from FormatMessage. This is the Windows
6636 2000 English version of the message. */
6637 const char *reason = "No mapping for the Unicode character exists "
6638 "in the target code page.";
6639 /* each step cannot decode more than 1 character, but a character can be
6640 represented as a surrogate pair */
6641 wchar_t buffer[2], *startout, *out;
6642 int insize, outsize;
6643 PyObject *errorHandler = NULL;
6644 PyObject *exc = NULL;
6645 PyObject *encoding_obj = NULL;
6646 char *encoding;
6647 DWORD err;
6648 int ret = -1;
6649
6650 assert(size > 0);
6651
6652 encoding = code_page_name(code_page, &encoding_obj);
6653 if (encoding == NULL)
6654 return -1;
6655
6656 if (errors == NULL || strcmp(errors, "strict") == 0) {
6657 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6658 UnicodeDecodeError. */
6659 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6660 if (exc != NULL) {
6661 PyCodec_StrictErrors(exc);
6662 Py_CLEAR(exc);
6663 }
6664 goto error;
6665 }
6666
6667 if (*v == NULL) {
6668 /* Create unicode object */
6669 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6670 PyErr_NoMemory();
6671 goto error;
6672 }
Victor Stinnerab595942011-12-17 04:59:06 +01006673 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006674 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006675 if (*v == NULL)
6676 goto error;
6677 startout = PyUnicode_AS_UNICODE(*v);
6678 }
6679 else {
6680 /* Extend unicode object */
6681 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6682 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6683 PyErr_NoMemory();
6684 goto error;
6685 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006686 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006687 goto error;
6688 startout = PyUnicode_AS_UNICODE(*v) + n;
6689 }
6690
6691 /* Decode the byte string character per character */
6692 out = startout;
6693 while (in < endin)
6694 {
6695 /* Decode a character */
6696 insize = 1;
6697 do
6698 {
6699 outsize = MultiByteToWideChar(code_page, flags,
6700 in, insize,
6701 buffer, Py_ARRAY_LENGTH(buffer));
6702 if (outsize > 0)
6703 break;
6704 err = GetLastError();
6705 if (err != ERROR_NO_UNICODE_TRANSLATION
6706 && err != ERROR_INSUFFICIENT_BUFFER)
6707 {
6708 PyErr_SetFromWindowsErr(0);
6709 goto error;
6710 }
6711 insize++;
6712 }
6713 /* 4=maximum length of a UTF-8 sequence */
6714 while (insize <= 4 && (in + insize) <= endin);
6715
6716 if (outsize <= 0) {
6717 Py_ssize_t startinpos, endinpos, outpos;
6718
6719 startinpos = in - startin;
6720 endinpos = startinpos + 1;
6721 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 errors, &errorHandler,
6724 encoding, reason,
6725 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006726 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 {
6728 goto error;
6729 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006730 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 }
6732 else {
6733 in += insize;
6734 memcpy(out, buffer, outsize * sizeof(wchar_t));
6735 out += outsize;
6736 }
6737 }
6738
6739 /* write a NUL character at the end */
6740 *out = 0;
6741
6742 /* Extend unicode object */
6743 outsize = out - startout;
6744 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006745 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006746 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006747 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006748
6749error:
6750 Py_XDECREF(encoding_obj);
6751 Py_XDECREF(errorHandler);
6752 Py_XDECREF(exc);
6753 return ret;
6754}
6755
Victor Stinner3a50e702011-10-18 21:21:00 +02006756static PyObject *
6757decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006758 const char *s, Py_ssize_t size,
6759 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760{
Victor Stinner76a31a62011-11-04 00:05:13 +01006761 PyObject *v = NULL;
6762 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 if (code_page < 0) {
6765 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6766 return NULL;
6767 }
6768
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
Victor Stinner76a31a62011-11-04 00:05:13 +01006772 do
6773 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006775 if (size > INT_MAX) {
6776 chunk_size = INT_MAX;
6777 final = 0;
6778 done = 0;
6779 }
6780 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 {
6783 chunk_size = (int)size;
6784 final = (consumed == NULL);
6785 done = 1;
6786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787
Victor Stinner76a31a62011-11-04 00:05:13 +01006788 /* Skip trailing lead-byte unless 'final' is set */
6789 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6790 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 if (chunk_size == 0 && done) {
6793 if (v != NULL)
6794 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006795 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006796 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797
Victor Stinner76a31a62011-11-04 00:05:13 +01006798
6799 converted = decode_code_page_strict(code_page, &v,
6800 s, chunk_size);
6801 if (converted == -2)
6802 converted = decode_code_page_errors(code_page, &v,
6803 s, chunk_size,
6804 errors);
6805 assert(converted != 0);
6806
6807 if (converted < 0) {
6808 Py_XDECREF(v);
6809 return NULL;
6810 }
6811
6812 if (consumed)
6813 *consumed += converted;
6814
6815 s += converted;
6816 size -= converted;
6817 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006818
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006819 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820}
6821
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006823PyUnicode_DecodeCodePageStateful(int code_page,
6824 const char *s,
6825 Py_ssize_t size,
6826 const char *errors,
6827 Py_ssize_t *consumed)
6828{
6829 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6830}
6831
6832PyObject *
6833PyUnicode_DecodeMBCSStateful(const char *s,
6834 Py_ssize_t size,
6835 const char *errors,
6836 Py_ssize_t *consumed)
6837{
6838 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6839}
6840
6841PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006842PyUnicode_DecodeMBCS(const char *s,
6843 Py_ssize_t size,
6844 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006845{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006846 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6847}
6848
Victor Stinner3a50e702011-10-18 21:21:00 +02006849static DWORD
6850encode_code_page_flags(UINT code_page, const char *errors)
6851{
6852 if (code_page == CP_UTF8) {
6853 if (winver.dwMajorVersion >= 6)
6854 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6855 and later */
6856 return WC_ERR_INVALID_CHARS;
6857 else
6858 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6859 return 0;
6860 }
6861 else if (code_page == CP_UTF7) {
6862 /* CP_UTF7 only supports flags=0 */
6863 return 0;
6864 }
6865 else {
6866 if (errors != NULL && strcmp(errors, "replace") == 0)
6867 return 0;
6868 else
6869 return WC_NO_BEST_FIT_CHARS;
6870 }
6871}
6872
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 * Encode a Unicode string to a Windows code page into a byte string in strict
6875 * mode.
6876 *
6877 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006878 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006880static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006881encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006882 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884{
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006886 BOOL *pusedDefaultChar = &usedDefaultChar;
6887 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006888 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006889 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006890 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 const DWORD flags = encode_code_page_flags(code_page, NULL);
6892 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006893 /* Create a substring so that we can get the UTF-16 representation
6894 of just the slice under consideration. */
6895 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896
Martin v. Löwis3d325192011-11-04 18:23:06 +01006897 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006898
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006900 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006902 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006903
Victor Stinner2fc507f2011-11-04 20:06:39 +01006904 substring = PyUnicode_Substring(unicode, offset, offset+len);
6905 if (substring == NULL)
6906 return -1;
6907 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6908 if (p == NULL) {
6909 Py_DECREF(substring);
6910 return -1;
6911 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006912
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006913 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 outsize = WideCharToMultiByte(code_page, flags,
6915 p, size,
6916 NULL, 0,
6917 NULL, pusedDefaultChar);
6918 if (outsize <= 0)
6919 goto error;
6920 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006921 if (pusedDefaultChar && *pusedDefaultChar) {
6922 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006924 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006925
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006929 if (*outbytes == NULL) {
6930 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006932 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 }
6935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 const Py_ssize_t n = PyBytes_Size(*outbytes);
6938 if (outsize > PY_SSIZE_T_MAX - n) {
6939 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006940 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006943 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6944 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006945 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006946 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948 }
6949
6950 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 outsize = WideCharToMultiByte(code_page, flags,
6952 p, size,
6953 out, outsize,
6954 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 if (outsize <= 0)
6957 goto error;
6958 if (pusedDefaultChar && *pusedDefaultChar)
6959 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006961
Victor Stinner3a50e702011-10-18 21:21:00 +02006962error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006963 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006964 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6965 return -2;
6966 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006967 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006968}
6969
Victor Stinner3a50e702011-10-18 21:21:00 +02006970/*
6971 * Encode a Unicode string to a Windows code page into a byte string using a
6972 * error handler.
6973 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006974 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 * -1 on other error.
6976 */
6977static int
6978encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006979 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006980 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006981{
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006983 Py_ssize_t pos = unicode_offset;
6984 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 /* Ideally, we should get reason from FormatMessage. This is the Windows
6986 2000 English version of the message. */
6987 const char *reason = "invalid character";
6988 /* 4=maximum length of a UTF-8 sequence */
6989 char buffer[4];
6990 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6991 Py_ssize_t outsize;
6992 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006993 PyObject *errorHandler = NULL;
6994 PyObject *exc = NULL;
6995 PyObject *encoding_obj = NULL;
6996 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01006997 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 PyObject *rep;
6999 int ret = -1;
7000
7001 assert(insize > 0);
7002
7003 encoding = code_page_name(code_page, &encoding_obj);
7004 if (encoding == NULL)
7005 return -1;
7006
7007 if (errors == NULL || strcmp(errors, "strict") == 0) {
7008 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7009 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007010 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 if (exc != NULL) {
7012 PyCodec_StrictErrors(exc);
7013 Py_DECREF(exc);
7014 }
7015 Py_XDECREF(encoding_obj);
7016 return -1;
7017 }
7018
7019 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7020 pusedDefaultChar = &usedDefaultChar;
7021 else
7022 pusedDefaultChar = NULL;
7023
7024 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7025 PyErr_NoMemory();
7026 goto error;
7027 }
7028 outsize = insize * Py_ARRAY_LENGTH(buffer);
7029
7030 if (*outbytes == NULL) {
7031 /* Create string object */
7032 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7033 if (*outbytes == NULL)
7034 goto error;
7035 out = PyBytes_AS_STRING(*outbytes);
7036 }
7037 else {
7038 /* Extend string object */
7039 Py_ssize_t n = PyBytes_Size(*outbytes);
7040 if (n > PY_SSIZE_T_MAX - outsize) {
7041 PyErr_NoMemory();
7042 goto error;
7043 }
7044 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7045 goto error;
7046 out = PyBytes_AS_STRING(*outbytes) + n;
7047 }
7048
7049 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007050 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7053 wchar_t chars[2];
7054 int charsize;
7055 if (ch < 0x10000) {
7056 chars[0] = (wchar_t)ch;
7057 charsize = 1;
7058 }
7059 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007060 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7061 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007062 charsize = 2;
7063 }
7064
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007066 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 buffer, Py_ARRAY_LENGTH(buffer),
7068 NULL, pusedDefaultChar);
7069 if (outsize > 0) {
7070 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7071 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007072 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 memcpy(out, buffer, outsize);
7074 out += outsize;
7075 continue;
7076 }
7077 }
7078 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7079 PyErr_SetFromWindowsErr(0);
7080 goto error;
7081 }
7082
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 rep = unicode_encode_call_errorhandler(
7084 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007085 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 if (rep == NULL)
7088 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007089 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007090
7091 if (PyBytes_Check(rep)) {
7092 outsize = PyBytes_GET_SIZE(rep);
7093 if (outsize != 1) {
7094 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7095 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7096 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7097 Py_DECREF(rep);
7098 goto error;
7099 }
7100 out = PyBytes_AS_STRING(*outbytes) + offset;
7101 }
7102 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7103 out += outsize;
7104 }
7105 else {
7106 Py_ssize_t i;
7107 enum PyUnicode_Kind kind;
7108 void *data;
7109
Benjamin Petersonbac79492012-01-14 13:34:47 -05007110 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 Py_DECREF(rep);
7112 goto error;
7113 }
7114
7115 outsize = PyUnicode_GET_LENGTH(rep);
7116 if (outsize != 1) {
7117 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7118 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7119 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7120 Py_DECREF(rep);
7121 goto error;
7122 }
7123 out = PyBytes_AS_STRING(*outbytes) + offset;
7124 }
7125 kind = PyUnicode_KIND(rep);
7126 data = PyUnicode_DATA(rep);
7127 for (i=0; i < outsize; i++) {
7128 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7129 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007130 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007131 encoding, unicode,
7132 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 "unable to encode error handler result to ASCII");
7134 Py_DECREF(rep);
7135 goto error;
7136 }
7137 *out = (unsigned char)ch;
7138 out++;
7139 }
7140 }
7141 Py_DECREF(rep);
7142 }
7143 /* write a NUL byte */
7144 *out = 0;
7145 outsize = out - PyBytes_AS_STRING(*outbytes);
7146 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7147 if (_PyBytes_Resize(outbytes, outsize) < 0)
7148 goto error;
7149 ret = 0;
7150
7151error:
7152 Py_XDECREF(encoding_obj);
7153 Py_XDECREF(errorHandler);
7154 Py_XDECREF(exc);
7155 return ret;
7156}
7157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158static PyObject *
7159encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007160 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 const char *errors)
7162{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007163 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007165 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007166 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007167
Benjamin Petersonbac79492012-01-14 13:34:47 -05007168 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 return NULL;
7170 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007171
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 if (code_page < 0) {
7173 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7174 return NULL;
7175 }
7176
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007178 return PyBytes_FromStringAndSize(NULL, 0);
7179
Victor Stinner7581cef2011-11-03 22:32:33 +01007180 offset = 0;
7181 do
7182 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007185 chunks. */
7186 if (len > INT_MAX/2) {
7187 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 done = 0;
7189 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007190 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007192 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007193 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 done = 1;
7195 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007198 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 errors);
7200 if (ret == -2)
7201 ret = encode_code_page_errors(code_page, &outbytes,
7202 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007204 if (ret < 0) {
7205 Py_XDECREF(outbytes);
7206 return NULL;
7207 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208
Victor Stinner7581cef2011-11-03 22:32:33 +01007209 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 return outbytes;
7214}
7215
7216PyObject *
7217PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7218 Py_ssize_t size,
7219 const char *errors)
7220{
Victor Stinner7581cef2011-11-03 22:32:33 +01007221 PyObject *unicode, *res;
7222 unicode = PyUnicode_FromUnicode(p, size);
7223 if (unicode == NULL)
7224 return NULL;
7225 res = encode_code_page(CP_ACP, unicode, errors);
7226 Py_DECREF(unicode);
7227 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228}
7229
7230PyObject *
7231PyUnicode_EncodeCodePage(int code_page,
7232 PyObject *unicode,
7233 const char *errors)
7234{
Victor Stinner7581cef2011-11-03 22:32:33 +01007235 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007236}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007237
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238PyObject *
7239PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007240{
7241 if (!PyUnicode_Check(unicode)) {
7242 PyErr_BadArgument();
7243 return NULL;
7244 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007245 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007246}
7247
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248#undef NEED_RETRY
7249
Victor Stinner99b95382011-07-04 14:23:54 +02007250#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007251
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252/* --- Character Mapping Codec -------------------------------------------- */
7253
Alexander Belopolsky40018472011-02-26 01:02:56 +00007254PyObject *
7255PyUnicode_DecodeCharmap(const char *s,
7256 Py_ssize_t size,
7257 PyObject *mapping,
7258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t startinpos;
7262 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007264 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 PyObject *errorHandler = NULL;
7266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 /* Default to Latin-1 */
7269 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007273 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007274 _PyUnicodeWriter_Init(&writer, 0);
7275 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007279 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007280 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007281 enum PyUnicode_Kind mapkind;
7282 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007283 Py_UCS4 x;
7284
Benjamin Petersonbac79492012-01-14 13:34:47 -05007285 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007286 return NULL;
7287
7288 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007289 mapdata = PyUnicode_DATA(mapping);
7290 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007292 unsigned char ch;
7293 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007294 enum PyUnicode_Kind outkind = writer.kind;
7295 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007296 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007297 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007298 while (s < e) {
7299 unsigned char ch = *s;
7300 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7301 if (x > maxchar)
7302 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007303 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7304 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007305 ++s;
7306 }
7307 break;
7308 }
7309 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007310 while (s < e) {
7311 unsigned char ch = *s;
7312 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7313 if (x == 0xFFFE)
7314 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007315 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7316 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007317 ++s;
7318 }
7319 break;
7320 }
7321 }
7322 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007325 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007326 else
7327 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007329 if (x == 0xfffe)
7330 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 startinpos = s-starts;
7333 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007334 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 errors, &errorHandler,
7336 "charmap", "character maps to <undefined>",
7337 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007338 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 goto onError;
7340 }
7341 continue;
7342 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007343
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007344 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7347 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007349 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007350 }
7351 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 while (s < e) {
7353 unsigned char ch = *s;
7354 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007355
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7357 w = PyLong_FromLong((long)ch);
7358 if (w == NULL)
7359 goto onError;
7360 x = PyObject_GetItem(mapping, w);
7361 Py_DECREF(w);
7362 if (x == NULL) {
7363 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7364 /* No mapping found means: mapping is undefined. */
7365 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007366 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 } else
7368 goto onError;
7369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007370
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007372 if (x == Py_None)
7373 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 if (PyLong_Check(x)) {
7375 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007376 if (value == 0xFFFE)
7377 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007378 if (value < 0 || value > MAX_UNICODE) {
7379 PyErr_Format(PyExc_TypeError,
7380 "character mapping must be in range(0x%lx)",
7381 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 Py_DECREF(x);
7383 goto onError;
7384 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007385
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007386 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007387 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007388 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007389 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007390 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7391 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007394 if (PyUnicode_READY(x) == -1) {
7395 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007396 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007397 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007398 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007399 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007400 if (value == 0xFFFE)
7401 goto Undefined;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007402 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
7403 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007404 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007405 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007406 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7407 writer.pos++;
7408 }
7409 else {
7410 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007411 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007412 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007413 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007414 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007415 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 }
7417 else {
7418 /* wrong return value */
7419 PyErr_SetString(PyExc_TypeError,
7420 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007421 Py_DECREF(x);
7422 goto onError;
7423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 Py_DECREF(x);
7425 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007426 continue;
7427Undefined:
7428 /* undefined mapping */
7429 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007430 startinpos = s-starts;
7431 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007432 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007433 errors, &errorHandler,
7434 "charmap", "character maps to <undefined>",
7435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007436 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007437 goto onError;
7438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441 Py_XDECREF(errorHandler);
7442 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007443 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007444
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 Py_XDECREF(errorHandler);
7447 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007448 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 return NULL;
7450}
7451
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452/* Charmap encoding: the lookup table */
7453
Alexander Belopolsky40018472011-02-26 01:02:56 +00007454struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 PyObject_HEAD
7456 unsigned char level1[32];
7457 int count2, count3;
7458 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007459};
7460
7461static PyObject*
7462encoding_map_size(PyObject *obj, PyObject* args)
7463{
7464 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007467}
7468
7469static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007470 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 PyDoc_STR("Return the size (in bytes) of this object") },
7472 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007473};
7474
7475static void
7476encoding_map_dealloc(PyObject* o)
7477{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007479}
7480
7481static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 "EncodingMap", /*tp_name*/
7484 sizeof(struct encoding_map), /*tp_basicsize*/
7485 0, /*tp_itemsize*/
7486 /* methods */
7487 encoding_map_dealloc, /*tp_dealloc*/
7488 0, /*tp_print*/
7489 0, /*tp_getattr*/
7490 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007491 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 0, /*tp_repr*/
7493 0, /*tp_as_number*/
7494 0, /*tp_as_sequence*/
7495 0, /*tp_as_mapping*/
7496 0, /*tp_hash*/
7497 0, /*tp_call*/
7498 0, /*tp_str*/
7499 0, /*tp_getattro*/
7500 0, /*tp_setattro*/
7501 0, /*tp_as_buffer*/
7502 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7503 0, /*tp_doc*/
7504 0, /*tp_traverse*/
7505 0, /*tp_clear*/
7506 0, /*tp_richcompare*/
7507 0, /*tp_weaklistoffset*/
7508 0, /*tp_iter*/
7509 0, /*tp_iternext*/
7510 encoding_map_methods, /*tp_methods*/
7511 0, /*tp_members*/
7512 0, /*tp_getset*/
7513 0, /*tp_base*/
7514 0, /*tp_dict*/
7515 0, /*tp_descr_get*/
7516 0, /*tp_descr_set*/
7517 0, /*tp_dictoffset*/
7518 0, /*tp_init*/
7519 0, /*tp_alloc*/
7520 0, /*tp_new*/
7521 0, /*tp_free*/
7522 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007523};
7524
7525PyObject*
7526PyUnicode_BuildEncodingMap(PyObject* string)
7527{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007528 PyObject *result;
7529 struct encoding_map *mresult;
7530 int i;
7531 int need_dict = 0;
7532 unsigned char level1[32];
7533 unsigned char level2[512];
7534 unsigned char *mlevel1, *mlevel2, *mlevel3;
7535 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 int kind;
7537 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007538 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007539 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007540
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007541 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007542 PyErr_BadArgument();
7543 return NULL;
7544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 kind = PyUnicode_KIND(string);
7546 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007547 length = PyUnicode_GET_LENGTH(string);
7548 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007549 memset(level1, 0xFF, sizeof level1);
7550 memset(level2, 0xFF, sizeof level2);
7551
7552 /* If there isn't a one-to-one mapping of NULL to \0,
7553 or if there are non-BMP characters, we need to use
7554 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007556 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007557 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007559 ch = PyUnicode_READ(kind, data, i);
7560 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007561 need_dict = 1;
7562 break;
7563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007565 /* unmapped character */
7566 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 l1 = ch >> 11;
7568 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007569 if (level1[l1] == 0xFF)
7570 level1[l1] = count2++;
7571 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007572 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007573 }
7574
7575 if (count2 >= 0xFF || count3 >= 0xFF)
7576 need_dict = 1;
7577
7578 if (need_dict) {
7579 PyObject *result = PyDict_New();
7580 PyObject *key, *value;
7581 if (!result)
7582 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007583 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007584 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007585 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586 if (!key || !value)
7587 goto failed1;
7588 if (PyDict_SetItem(result, key, value) == -1)
7589 goto failed1;
7590 Py_DECREF(key);
7591 Py_DECREF(value);
7592 }
7593 return result;
7594 failed1:
7595 Py_XDECREF(key);
7596 Py_XDECREF(value);
7597 Py_DECREF(result);
7598 return NULL;
7599 }
7600
7601 /* Create a three-level trie */
7602 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7603 16*count2 + 128*count3 - 1);
7604 if (!result)
7605 return PyErr_NoMemory();
7606 PyObject_Init(result, &EncodingMapType);
7607 mresult = (struct encoding_map*)result;
7608 mresult->count2 = count2;
7609 mresult->count3 = count3;
7610 mlevel1 = mresult->level1;
7611 mlevel2 = mresult->level23;
7612 mlevel3 = mresult->level23 + 16*count2;
7613 memcpy(mlevel1, level1, 32);
7614 memset(mlevel2, 0xFF, 16*count2);
7615 memset(mlevel3, 0, 128*count3);
7616 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007617 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007618 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007619 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7620 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621 /* unmapped character */
7622 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007623 o1 = ch>>11;
7624 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625 i2 = 16*mlevel1[o1] + o2;
7626 if (mlevel2[i2] == 0xFF)
7627 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007628 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007629 i3 = 128*mlevel2[i2] + o3;
7630 mlevel3[i3] = i;
7631 }
7632 return result;
7633}
7634
7635static int
Victor Stinner22168992011-11-20 17:09:18 +01007636encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007637{
7638 struct encoding_map *map = (struct encoding_map*)mapping;
7639 int l1 = c>>11;
7640 int l2 = (c>>7) & 0xF;
7641 int l3 = c & 0x7F;
7642 int i;
7643
Victor Stinner22168992011-11-20 17:09:18 +01007644 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007646 if (c == 0)
7647 return 0;
7648 /* level 1*/
7649 i = map->level1[l1];
7650 if (i == 0xFF) {
7651 return -1;
7652 }
7653 /* level 2*/
7654 i = map->level23[16*i+l2];
7655 if (i == 0xFF) {
7656 return -1;
7657 }
7658 /* level 3 */
7659 i = map->level23[16*map->count2 + 128*i + l3];
7660 if (i == 0) {
7661 return -1;
7662 }
7663 return i;
7664}
7665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666/* Lookup the character ch in the mapping. If the character
7667 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007668 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007669static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007670charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671{
Christian Heimes217cfd12007-12-02 14:31:20 +00007672 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673 PyObject *x;
7674
7675 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007677 x = PyObject_GetItem(mapping, w);
7678 Py_DECREF(w);
7679 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7681 /* No mapping found means: mapping is undefined. */
7682 PyErr_Clear();
7683 x = Py_None;
7684 Py_INCREF(x);
7685 return x;
7686 } else
7687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007689 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007691 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 long value = PyLong_AS_LONG(x);
7693 if (value < 0 || value > 255) {
7694 PyErr_SetString(PyExc_TypeError,
7695 "character mapping must be in range(256)");
7696 Py_DECREF(x);
7697 return NULL;
7698 }
7699 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007701 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 /* wrong return value */
7705 PyErr_Format(PyExc_TypeError,
7706 "character mapping must return integer, bytes or None, not %.400s",
7707 x->ob_type->tp_name);
7708 Py_DECREF(x);
7709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
7711}
7712
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007714charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7717 /* exponentially overallocate to minimize reallocations */
7718 if (requiredsize < 2*outsize)
7719 requiredsize = 2*outsize;
7720 if (_PyBytes_Resize(outobj, requiredsize))
7721 return -1;
7722 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723}
7724
/* Outcome of encoding one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,
    enc_FAILED = 1,
    enc_EXCEPTION = 2
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007728/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007729 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 space is available. Return a new reference to the object that
7731 was put in the output buffer, or Py_None, if the mapping was undefined
7732 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007733 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007734static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007735charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007736 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007738 PyObject *rep;
7739 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007740 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741
Christian Heimes90aa7642007-12-19 02:45:37 +00007742 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745 if (res == -1)
7746 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 if (outsize<requiredsize)
7748 if (charmapencode_resize(outobj, outpos, requiredsize))
7749 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007750 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 outstart[(*outpos)++] = (char)res;
7752 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753 }
7754
7755 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 Py_DECREF(rep);
7760 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007761 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 if (PyLong_Check(rep)) {
7763 Py_ssize_t requiredsize = *outpos+1;
7764 if (outsize<requiredsize)
7765 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7766 Py_DECREF(rep);
7767 return enc_EXCEPTION;
7768 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007769 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 else {
7773 const char *repchars = PyBytes_AS_STRING(rep);
7774 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7775 Py_ssize_t requiredsize = *outpos+repsize;
7776 if (outsize<requiredsize)
7777 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7778 Py_DECREF(rep);
7779 return enc_EXCEPTION;
7780 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007781 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 memcpy(outstart + *outpos, repchars, repsize);
7783 *outpos += repsize;
7784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786 Py_DECREF(rep);
7787 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007788}
7789
7790/* handle an error in PyUnicode_EncodeCharmap
7791 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007792static int
7793charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007794 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007796 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007797 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798{
7799 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007800 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007802 enum PyUnicode_Kind kind;
7803 void *data;
7804 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007805 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806 Py_ssize_t collstartpos = *inpos;
7807 Py_ssize_t collendpos = *inpos+1;
7808 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 char *encoding = "charmap";
7810 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007811 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007812 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007813 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814
Benjamin Petersonbac79492012-01-14 13:34:47 -05007815 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007816 return -1;
7817 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818 /* find all unencodable characters */
7819 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007821 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007822 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007823 val = encoding_map_lookup(ch, mapping);
7824 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 break;
7826 ++collendpos;
7827 continue;
7828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007830 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7831 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 if (rep==NULL)
7833 return -1;
7834 else if (rep!=Py_None) {
7835 Py_DECREF(rep);
7836 break;
7837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 }
7841 /* cache callback name lookup
7842 * (if not done yet, i.e. it's the first error) */
7843 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if ((errors==NULL) || (!strcmp(errors, "strict")))
7845 *known_errorHandler = 1;
7846 else if (!strcmp(errors, "replace"))
7847 *known_errorHandler = 2;
7848 else if (!strcmp(errors, "ignore"))
7849 *known_errorHandler = 3;
7850 else if (!strcmp(errors, "xmlcharrefreplace"))
7851 *known_errorHandler = 4;
7852 else
7853 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007854 }
7855 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007857 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 return -1;
7859 case 2: /* replace */
7860 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 x = charmapencode_output('?', mapping, res, respos);
7862 if (x==enc_EXCEPTION) {
7863 return -1;
7864 }
7865 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007866 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 return -1;
7868 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007869 }
7870 /* fall through */
7871 case 3: /* ignore */
7872 *inpos = collendpos;
7873 break;
7874 case 4: /* xmlcharrefreplace */
7875 /* generate replacement (temporarily (mis)uses p) */
7876 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 char buffer[2+29+1+1];
7878 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007879 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 for (cp = buffer; *cp; ++cp) {
7881 x = charmapencode_output(*cp, mapping, res, respos);
7882 if (x==enc_EXCEPTION)
7883 return -1;
7884 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007885 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 return -1;
7887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 }
7889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 *inpos = collendpos;
7891 break;
7892 default:
7893 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007894 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007898 if (PyBytes_Check(repunicode)) {
7899 /* Directly copy bytes result to output. */
7900 Py_ssize_t outsize = PyBytes_Size(*res);
7901 Py_ssize_t requiredsize;
7902 repsize = PyBytes_Size(repunicode);
7903 requiredsize = *respos + repsize;
7904 if (requiredsize > outsize)
7905 /* Make room for all additional bytes. */
7906 if (charmapencode_resize(res, respos, requiredsize)) {
7907 Py_DECREF(repunicode);
7908 return -1;
7909 }
7910 memcpy(PyBytes_AsString(*res) + *respos,
7911 PyBytes_AsString(repunicode), repsize);
7912 *respos += repsize;
7913 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007914 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007915 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007918 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007919 Py_DECREF(repunicode);
7920 return -1;
7921 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007922 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007923 data = PyUnicode_DATA(repunicode);
7924 kind = PyUnicode_KIND(repunicode);
7925 for (index = 0; index < repsize; index++) {
7926 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7927 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007929 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 return -1;
7931 }
7932 else if (x==enc_FAILED) {
7933 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007934 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 return -1;
7936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 }
7938 *inpos = newpos;
7939 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940 }
7941 return 0;
7942}
7943
Alexander Belopolsky40018472011-02-26 01:02:56 +00007944PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007945_PyUnicode_EncodeCharmap(PyObject *unicode,
7946 PyObject *mapping,
7947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 /* output object */
7950 PyObject *res = NULL;
7951 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007953 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 PyObject *errorHandler = NULL;
7957 PyObject *exc = NULL;
7958 /* the following variable is used for caching string comparisons
7959 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7960 * 3=ignore, 4=xmlcharrefreplace */
7961 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return NULL;
7965 size = PyUnicode_GET_LENGTH(unicode);
7966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 /* Default to Latin-1 */
7968 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007969 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 /* allocate enough for a simple encoding without
7972 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007973 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 if (res == NULL)
7975 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007976 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007982 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (x==enc_EXCEPTION) /* error */
7984 goto onError;
7985 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007986 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 &exc,
7988 &known_errorHandler, &errorHandler, errors,
7989 &res, &respos)) {
7990 goto onError;
7991 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007992 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 else
7994 /* done with this character => adjust input position */
7995 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007999 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008000 if (_PyBytes_Resize(&res, respos) < 0)
8001 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 Py_XDECREF(exc);
8004 Py_XDECREF(errorHandler);
8005 return res;
8006
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 Py_XDECREF(res);
8009 Py_XDECREF(exc);
8010 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 return NULL;
8012}
8013
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014/* Deprecated */
8015PyObject *
8016PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8017 Py_ssize_t size,
8018 PyObject *mapping,
8019 const char *errors)
8020{
8021 PyObject *result;
8022 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8023 if (unicode == NULL)
8024 return NULL;
8025 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8026 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008027 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008028}
8029
Alexander Belopolsky40018472011-02-26 01:02:56 +00008030PyObject *
8031PyUnicode_AsCharmapString(PyObject *unicode,
8032 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
8034 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 PyErr_BadArgument();
8036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008038 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039}
8040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008042static void
8043make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045 Py_ssize_t startpos, Py_ssize_t endpos,
8046 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 *exceptionObject = _PyUnicodeTranslateError_Create(
8050 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 }
8052 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8054 goto onError;
8055 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8056 goto onError;
8057 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8058 goto onError;
8059 return;
8060 onError:
8061 Py_DECREF(*exceptionObject);
8062 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 }
8064}
8065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066/* error handling callback helper:
8067 build arguments, call the callback and check the arguments,
8068 put the result into newpos and return the replacement string, which
8069 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008070static PyObject *
8071unicode_translate_call_errorhandler(const char *errors,
8072 PyObject **errorHandler,
8073 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075 Py_ssize_t startpos, Py_ssize_t endpos,
8076 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008078 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008080 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 PyObject *restuple;
8082 PyObject *resunicode;
8083
8084 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089
8090 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094
8095 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008100 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 Py_DECREF(restuple);
8102 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 }
8104 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 &resunicode, &i_newpos)) {
8106 Py_DECREF(restuple);
8107 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111 else
8112 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8115 Py_DECREF(restuple);
8116 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008117 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 Py_INCREF(resunicode);
8119 Py_DECREF(restuple);
8120 return resunicode;
8121}
8122
8123/* Lookup the character ch in the mapping and put the result in result,
8124 which must be decrefed by the caller.
8125 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008126static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128{
Christian Heimes217cfd12007-12-02 14:31:20 +00008129 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 PyObject *x;
8131
8132 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008134 x = PyObject_GetItem(mapping, w);
8135 Py_DECREF(w);
8136 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8138 /* No mapping found means: use 1:1 mapping. */
8139 PyErr_Clear();
8140 *result = NULL;
8141 return 0;
8142 } else
8143 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 }
8145 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 *result = x;
8147 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008149 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 long value = PyLong_AS_LONG(x);
8151 long max = PyUnicode_GetMax();
8152 if (value < 0 || value > max) {
8153 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008154 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 Py_DECREF(x);
8156 return -1;
8157 }
8158 *result = x;
8159 return 0;
8160 }
8161 else if (PyUnicode_Check(x)) {
8162 *result = x;
8163 return 0;
8164 }
8165 else {
8166 /* wrong return value */
8167 PyErr_SetString(PyExc_TypeError,
8168 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 Py_DECREF(x);
8170 return -1;
8171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172}
8173/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 if not reallocate and adjust various state variables.
8175 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008176static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008181 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008182 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 /* exponentially overallocate to minimize reallocations */
8184 if (requiredsize < 2 * oldsize)
8185 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008186 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8187 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008189 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191 }
8192 return 0;
8193}
8194/* lookup the character, put the result in the output string and adjust
8195 various state variables. Return a new reference to the object that
8196 was put in the output buffer in *result, or Py_None, if the mapping was
8197 undefined (in which case no character was written).
8198 The called must decref result.
8199 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008200static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8202 PyObject *mapping, Py_UCS4 **output,
8203 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8207 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 }
8213 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008215 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 }
8219 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 Py_ssize_t repsize;
8221 if (PyUnicode_READY(*res) == -1)
8222 return -1;
8223 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 if (repsize==1) {
8225 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 }
8228 else if (repsize!=0) {
8229 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 Py_ssize_t requiredsize = *opos +
8231 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 Py_ssize_t i;
8234 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 for(i = 0; i < repsize; i++)
8237 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 }
8240 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 return 0;
8243}
8244
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246_PyUnicode_TranslateCharmap(PyObject *input,
8247 PyObject *mapping,
8248 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 /* input object */
8251 char *idata;
8252 Py_ssize_t size, i;
8253 int kind;
8254 /* output buffer */
8255 Py_UCS4 *output = NULL;
8256 Py_ssize_t osize;
8257 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 char *reason = "character maps to <undefined>";
8261 PyObject *errorHandler = NULL;
8262 PyObject *exc = NULL;
8263 /* the following variable is used for caching string comparisons
8264 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8265 * 3=ignore, 4=xmlcharrefreplace */
8266 int known_errorHandler = -1;
8267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyErr_BadArgument();
8270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 if (PyUnicode_READY(input) == -1)
8274 return NULL;
8275 idata = (char*)PyUnicode_DATA(input);
8276 kind = PyUnicode_KIND(input);
8277 size = PyUnicode_GET_LENGTH(input);
8278 i = 0;
8279
8280 if (size == 0) {
8281 Py_INCREF(input);
8282 return input;
8283 }
8284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 /* allocate enough for a simple 1:1 translation without
8286 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 osize = size;
8288 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8289 opos = 0;
8290 if (output == NULL) {
8291 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* try to encode it */
8297 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 if (charmaptranslate_output(input, i, mapping,
8299 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 Py_XDECREF(x);
8301 goto onError;
8302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 else { /* untranslatable character */
8307 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8308 Py_ssize_t repsize;
8309 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 Py_ssize_t collstart = i;
8313 Py_ssize_t collend = i+1;
8314 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 while (collend < size) {
8318 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 goto onError;
8320 Py_XDECREF(x);
8321 if (x!=Py_None)
8322 break;
8323 ++collend;
8324 }
8325 /* cache callback name lookup
8326 * (if not done yet, i.e. it's the first error) */
8327 if (known_errorHandler==-1) {
8328 if ((errors==NULL) || (!strcmp(errors, "strict")))
8329 known_errorHandler = 1;
8330 else if (!strcmp(errors, "replace"))
8331 known_errorHandler = 2;
8332 else if (!strcmp(errors, "ignore"))
8333 known_errorHandler = 3;
8334 else if (!strcmp(errors, "xmlcharrefreplace"))
8335 known_errorHandler = 4;
8336 else
8337 known_errorHandler = 0;
8338 }
8339 switch (known_errorHandler) {
8340 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008341 make_translate_exception(&exc,
8342 input, collstart, collend, reason);
8343 if (exc != NULL)
8344 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 case 2: /* replace */
8347 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 for (coll = collstart; coll<collend; coll++)
8349 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 /* fall through */
8351 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 break;
8354 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 /* generate replacement (temporarily (mis)uses i) */
8356 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 char buffer[2+29+1+1];
8358 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8360 if (charmaptranslate_makespace(&output, &osize,
8361 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 goto onError;
8363 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 break;
8368 default:
8369 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 reason, input, &exc,
8371 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008372 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008374 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008375 Py_DECREF(repunicode);
8376 goto onError;
8377 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 repsize = PyUnicode_GET_LENGTH(repunicode);
8380 if (charmaptranslate_makespace(&output, &osize,
8381 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 Py_DECREF(repunicode);
8383 goto onError;
8384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 for (uni2 = 0; repsize-->0; ++uni2)
8386 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8387 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
8391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8393 if (!res)
8394 goto onError;
8395 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 Py_XDECREF(exc);
8397 Py_XDECREF(errorHandler);
8398 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 Py_XDECREF(exc);
8403 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 return NULL;
8405}
8406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407/* Deprecated. Use PyUnicode_Translate instead. */
8408PyObject *
8409PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8410 Py_ssize_t size,
8411 PyObject *mapping,
8412 const char *errors)
8413{
Christian Heimes5f520f42012-09-11 14:03:25 +02008414 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8416 if (!unicode)
8417 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008418 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8419 Py_DECREF(unicode);
8420 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421}
8422
Alexander Belopolsky40018472011-02-26 01:02:56 +00008423PyObject *
8424PyUnicode_Translate(PyObject *str,
8425 PyObject *mapping,
8426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427{
8428 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 str = PyUnicode_FromObject(str);
8431 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008432 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 Py_DECREF(str);
8435 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436}
Tim Petersced69f82003-09-16 20:30:58 +00008437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008439fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440{
8441 /* No need to call PyUnicode_READY(self) because this function is only
8442 called as a callback from fixup() which does it already. */
8443 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8444 const int kind = PyUnicode_KIND(self);
8445 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008446 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008447 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 Py_ssize_t i;
8449
8450 for (i = 0; i < len; ++i) {
8451 ch = PyUnicode_READ(kind, data, i);
8452 fixed = 0;
8453 if (ch > 127) {
8454 if (Py_UNICODE_ISSPACE(ch))
8455 fixed = ' ';
8456 else {
8457 const int decimal = Py_UNICODE_TODECIMAL(ch);
8458 if (decimal >= 0)
8459 fixed = '0' + decimal;
8460 }
8461 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008462 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008463 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 PyUnicode_WRITE(kind, data, i, fixed);
8465 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008466 else
8467 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 }
8470
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008471 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472}
8473
8474PyObject *
8475_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8476{
8477 if (!PyUnicode_Check(unicode)) {
8478 PyErr_BadInternalCall();
8479 return NULL;
8480 }
8481 if (PyUnicode_READY(unicode) == -1)
8482 return NULL;
8483 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8484 /* If the string is already ASCII, just return the same string */
8485 Py_INCREF(unicode);
8486 return unicode;
8487 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008488 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489}
8490
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008491PyObject *
8492PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8493 Py_ssize_t length)
8494{
Victor Stinnerf0124502011-11-21 23:12:56 +01008495 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008496 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008497 Py_UCS4 maxchar;
8498 enum PyUnicode_Kind kind;
8499 void *data;
8500
Victor Stinner99d7ad02012-02-22 13:37:39 +01008501 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008502 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008503 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008504 if (ch > 127) {
8505 int decimal = Py_UNICODE_TODECIMAL(ch);
8506 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008507 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008508 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008509 }
8510 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008511
8512 /* Copy to a new string */
8513 decimal = PyUnicode_New(length, maxchar);
8514 if (decimal == NULL)
8515 return decimal;
8516 kind = PyUnicode_KIND(decimal);
8517 data = PyUnicode_DATA(decimal);
8518 /* Iterate over code points */
8519 for (i = 0; i < length; i++) {
8520 Py_UNICODE ch = s[i];
8521 if (ch > 127) {
8522 int decimal = Py_UNICODE_TODECIMAL(ch);
8523 if (decimal >= 0)
8524 ch = '0' + decimal;
8525 }
8526 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008528 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008529}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008530/* --- Decimal Encoder ---------------------------------------------------- */
8531
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532int
8533PyUnicode_EncodeDecimal(Py_UNICODE *s,
8534 Py_ssize_t length,
8535 char *output,
8536 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008537{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008539 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008540 enum PyUnicode_Kind kind;
8541 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008542
8543 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 PyErr_BadArgument();
8545 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008546 }
8547
Victor Stinner42bf7752011-11-21 22:52:58 +01008548 unicode = PyUnicode_FromUnicode(s, length);
8549 if (unicode == NULL)
8550 return -1;
8551
Benjamin Petersonbac79492012-01-14 13:34:47 -05008552 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008553 Py_DECREF(unicode);
8554 return -1;
8555 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008556 kind = PyUnicode_KIND(unicode);
8557 data = PyUnicode_DATA(unicode);
8558
Victor Stinnerb84d7232011-11-22 01:50:07 +01008559 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008560 PyObject *exc;
8561 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008563 Py_ssize_t startpos;
8564
8565 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008566
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008568 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008569 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 decimal = Py_UNICODE_TODECIMAL(ch);
8573 if (decimal >= 0) {
8574 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008575 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 continue;
8577 }
8578 if (0 < ch && ch < 256) {
8579 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008580 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 continue;
8582 }
Victor Stinner6345be92011-11-25 20:09:01 +01008583
Victor Stinner42bf7752011-11-21 22:52:58 +01008584 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008585 exc = NULL;
8586 raise_encode_exception(&exc, "decimal", unicode,
8587 startpos, startpos+1,
8588 "invalid decimal Unicode string");
8589 Py_XDECREF(exc);
8590 Py_DECREF(unicode);
8591 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008592 }
8593 /* 0-terminate the output string */
8594 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008595 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008596 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008597}
8598
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599/* --- Helpers ------------------------------------------------------------ */
8600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008602any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t start,
8604 Py_ssize_t end)
8605{
8606 int kind1, kind2, kind;
8607 void *buf1, *buf2;
8608 Py_ssize_t len1, len2, result;
8609
8610 kind1 = PyUnicode_KIND(s1);
8611 kind2 = PyUnicode_KIND(s2);
8612 kind = kind1 > kind2 ? kind1 : kind2;
8613 buf1 = PyUnicode_DATA(s1);
8614 buf2 = PyUnicode_DATA(s2);
8615 if (kind1 != kind)
8616 buf1 = _PyUnicode_AsKind(s1, kind);
8617 if (!buf1)
8618 return -2;
8619 if (kind2 != kind)
8620 buf2 = _PyUnicode_AsKind(s2, kind);
8621 if (!buf2) {
8622 if (kind1 != kind) PyMem_Free(buf1);
8623 return -2;
8624 }
8625 len1 = PyUnicode_GET_LENGTH(s1);
8626 len2 = PyUnicode_GET_LENGTH(s2);
8627
Victor Stinner794d5672011-10-10 03:21:36 +02008628 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008629 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008630 case PyUnicode_1BYTE_KIND:
8631 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8632 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8633 else
8634 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8635 break;
8636 case PyUnicode_2BYTE_KIND:
8637 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8638 break;
8639 case PyUnicode_4BYTE_KIND:
8640 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8641 break;
8642 default:
8643 assert(0); result = -2;
8644 }
8645 }
8646 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008647 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008648 case PyUnicode_1BYTE_KIND:
8649 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8650 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8651 else
8652 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8653 break;
8654 case PyUnicode_2BYTE_KIND:
8655 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8656 break;
8657 case PyUnicode_4BYTE_KIND:
8658 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8659 break;
8660 default:
8661 assert(0); result = -2;
8662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 }
8664
8665 if (kind1 != kind)
8666 PyMem_Free(buf1);
8667 if (kind2 != kind)
8668 PyMem_Free(buf2);
8669
8670 return result;
8671}
8672
8673Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008674_PyUnicode_InsertThousandsGrouping(
8675 PyObject *unicode, Py_ssize_t index,
8676 Py_ssize_t n_buffer,
8677 void *digits, Py_ssize_t n_digits,
8678 Py_ssize_t min_width,
8679 const char *grouping, PyObject *thousands_sep,
8680 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681{
Victor Stinner41a863c2012-02-24 00:37:51 +01008682 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008683 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008684 Py_ssize_t thousands_sep_len;
8685 Py_ssize_t len;
8686
8687 if (unicode != NULL) {
8688 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008689 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008690 }
8691 else {
8692 kind = PyUnicode_1BYTE_KIND;
8693 data = NULL;
8694 }
8695 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8696 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8697 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8698 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008699 if (thousands_sep_kind < kind) {
8700 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8701 if (!thousands_sep_data)
8702 return -1;
8703 }
8704 else {
8705 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8706 if (!data)
8707 return -1;
8708 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008709 }
8710
Benjamin Petersonead6b532011-12-20 17:23:42 -06008711 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008713 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008714 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008715 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008717 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008718 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008719 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008720 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008721 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008722 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008723 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008726 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008727 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008728 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008729 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008731 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008732 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008733 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008734 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008735 break;
8736 default:
8737 assert(0);
8738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008740 if (unicode != NULL && thousands_sep_kind != kind) {
8741 if (thousands_sep_kind < kind)
8742 PyMem_Free(thousands_sep_data);
8743 else
8744 PyMem_Free(data);
8745 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008746 if (unicode == NULL) {
8747 *maxchar = 127;
8748 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008749 *maxchar = MAX_MAXCHAR(*maxchar,
8750 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 }
8752 }
8753 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754}
8755
8756
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length and may be evaluated more than once.  `end` is clamped to
   [0, len] (negative values count from the end).  A negative `start`
   counts from the end and is clamped at 0; a `start` larger than `len` is
   left unchanged.

   The body is wrapped in do { } while (0) and the arguments are
   parenthesised so that the macro expands to a single statement and can be
   used safely as the body of an unbraced if/else.  Invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008771
Alexander Belopolsky40018472011-02-26 01:02:56 +00008772Py_ssize_t
8773PyUnicode_Count(PyObject *str,
8774 PyObject *substr,
8775 Py_ssize_t start,
8776 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008778 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008779 PyObject* str_obj;
8780 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 int kind1, kind2, kind;
8782 void *buf1 = NULL, *buf2 = NULL;
8783 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008784
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008785 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008786 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008788 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008789 if (!sub_obj) {
8790 Py_DECREF(str_obj);
8791 return -1;
8792 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008793 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008794 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 Py_DECREF(str_obj);
8796 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 }
Tim Petersced69f82003-09-16 20:30:58 +00008798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 kind1 = PyUnicode_KIND(str_obj);
8800 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008801 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008804 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008805 if (kind2 > kind) {
8806 Py_DECREF(sub_obj);
8807 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008808 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008809 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008810 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 if (!buf2)
8813 goto onError;
8814 len1 = PyUnicode_GET_LENGTH(str_obj);
8815 len2 = PyUnicode_GET_LENGTH(sub_obj);
8816
8817 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008818 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008820 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8821 result = asciilib_count(
8822 ((Py_UCS1*)buf1) + start, end - start,
8823 buf2, len2, PY_SSIZE_T_MAX
8824 );
8825 else
8826 result = ucs1lib_count(
8827 ((Py_UCS1*)buf1) + start, end - start,
8828 buf2, len2, PY_SSIZE_T_MAX
8829 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 break;
8831 case PyUnicode_2BYTE_KIND:
8832 result = ucs2lib_count(
8833 ((Py_UCS2*)buf1) + start, end - start,
8834 buf2, len2, PY_SSIZE_T_MAX
8835 );
8836 break;
8837 case PyUnicode_4BYTE_KIND:
8838 result = ucs4lib_count(
8839 ((Py_UCS4*)buf1) + start, end - start,
8840 buf2, len2, PY_SSIZE_T_MAX
8841 );
8842 break;
8843 default:
8844 assert(0); result = 0;
8845 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008846
8847 Py_DECREF(sub_obj);
8848 Py_DECREF(str_obj);
8849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 if (kind2 != kind)
8851 PyMem_Free(buf2);
8852
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 onError:
8855 Py_DECREF(sub_obj);
8856 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 if (kind2 != kind && buf2)
8858 PyMem_Free(buf2);
8859 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860}
8861
Alexander Belopolsky40018472011-02-26 01:02:56 +00008862Py_ssize_t
8863PyUnicode_Find(PyObject *str,
8864 PyObject *sub,
8865 Py_ssize_t start,
8866 Py_ssize_t end,
8867 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008872 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008874 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008875 if (!sub) {
8876 Py_DECREF(str);
8877 return -2;
8878 }
8879 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8880 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 Py_DECREF(str);
8882 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 }
Tim Petersced69f82003-09-16 20:30:58 +00008884
Victor Stinner794d5672011-10-10 03:21:36 +02008885 result = any_find_slice(direction,
8886 str, sub, start, end
8887 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008888
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008890 Py_DECREF(sub);
8891
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 return result;
8893}
8894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895Py_ssize_t
8896PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8897 Py_ssize_t start, Py_ssize_t end,
8898 int direction)
8899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008901 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 if (PyUnicode_READY(str) == -1)
8903 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008904 if (start < 0 || end < 0) {
8905 PyErr_SetString(PyExc_IndexError, "string index out of range");
8906 return -2;
8907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (end > PyUnicode_GET_LENGTH(str))
8909 end = PyUnicode_GET_LENGTH(str);
8910 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008911 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8912 kind, end-start, ch, direction);
8913 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008915 else
8916 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917}
8918
Alexander Belopolsky40018472011-02-26 01:02:56 +00008919static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008920tailmatch(PyObject *self,
8921 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922 Py_ssize_t start,
8923 Py_ssize_t end,
8924 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 int kind_self;
8927 int kind_sub;
8928 void *data_self;
8929 void *data_sub;
8930 Py_ssize_t offset;
8931 Py_ssize_t i;
8932 Py_ssize_t end_sub;
8933
8934 if (PyUnicode_READY(self) == -1 ||
8935 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008936 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937
8938 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 return 1;
8940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8942 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 kind_self = PyUnicode_KIND(self);
8947 data_self = PyUnicode_DATA(self);
8948 kind_sub = PyUnicode_KIND(substring);
8949 data_sub = PyUnicode_DATA(substring);
8950 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8951
8952 if (direction > 0)
8953 offset = end;
8954 else
8955 offset = start;
8956
8957 if (PyUnicode_READ(kind_self, data_self, offset) ==
8958 PyUnicode_READ(kind_sub, data_sub, 0) &&
8959 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8960 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8961 /* If both are of the same kind, memcmp is sufficient */
8962 if (kind_self == kind_sub) {
8963 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008964 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 data_sub,
8966 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008967 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 }
8969 /* otherwise we have to compare each character by first accesing it */
8970 else {
8971 /* We do not need to compare 0 and len(substring)-1 because
8972 the if statement above ensured already that they are equal
8973 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 for (i = 1; i < end_sub; ++i) {
8975 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8976 PyUnicode_READ(kind_sub, data_sub, i))
8977 return 0;
8978 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 }
8982
8983 return 0;
8984}
8985
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986Py_ssize_t
8987PyUnicode_Tailmatch(PyObject *str,
8988 PyObject *substr,
8989 Py_ssize_t start,
8990 Py_ssize_t end,
8991 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008993 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 str = PyUnicode_FromObject(str);
8996 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 substr = PyUnicode_FromObject(substr);
8999 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 Py_DECREF(str);
9001 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
Tim Petersced69f82003-09-16 20:30:58 +00009003
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009004 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 Py_DECREF(str);
9007 Py_DECREF(substr);
9008 return result;
9009}
9010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011/* Apply fixfct filter to the Unicode object self and return a
9012 reference to the modified object */
9013
Alexander Belopolsky40018472011-02-26 01:02:56 +00009014static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009015fixup(PyObject *self,
9016 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 PyObject *u;
9019 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009020 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009022 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009025 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 /* fix functions return the new maximum character in a string,
9028 if the kind of the resulting unicode object does not change,
9029 everything is fine. Otherwise we need to change the string kind
9030 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009031 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009032
9033 if (maxchar_new == 0) {
9034 /* no changes */;
9035 if (PyUnicode_CheckExact(self)) {
9036 Py_DECREF(u);
9037 Py_INCREF(self);
9038 return self;
9039 }
9040 else
9041 return u;
9042 }
9043
Victor Stinnere6abb482012-05-02 01:15:40 +02009044 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045
Victor Stinnereaab6042011-12-11 22:22:39 +01009046 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009048
9049 /* In case the maximum character changed, we need to
9050 convert the string to the new category. */
9051 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9052 if (v == NULL) {
9053 Py_DECREF(u);
9054 return NULL;
9055 }
9056 if (maxchar_new > maxchar_old) {
9057 /* If the maxchar increased so that the kind changed, not all
9058 characters are representable anymore and we need to fix the
9059 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009060 _PyUnicode_FastCopyCharacters(v, 0,
9061 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009062 maxchar_old = fixfct(v);
9063 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 }
9065 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009066 _PyUnicode_FastCopyCharacters(v, 0,
9067 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009069 Py_DECREF(u);
9070 assert(_PyUnicode_CheckConsistency(v, 1));
9071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009074static PyObject *
9075ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009077 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9078 char *resdata, *data = PyUnicode_DATA(self);
9079 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009080
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009081 res = PyUnicode_New(len, 127);
9082 if (res == NULL)
9083 return NULL;
9084 resdata = PyUnicode_DATA(res);
9085 if (lower)
9086 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009088 _Py_bytes_upper(resdata, data, len);
9089 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090}
9091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009093handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009095 Py_ssize_t j;
9096 int final_sigma;
9097 Py_UCS4 c;
9098 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009099
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009100 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9101
9102 where ! is a negation and \p{xxx} is a character with property xxx.
9103 */
9104 for (j = i - 1; j >= 0; j--) {
9105 c = PyUnicode_READ(kind, data, j);
9106 if (!_PyUnicode_IsCaseIgnorable(c))
9107 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009109 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9110 if (final_sigma) {
9111 for (j = i + 1; j < length; j++) {
9112 c = PyUnicode_READ(kind, data, j);
9113 if (!_PyUnicode_IsCaseIgnorable(c))
9114 break;
9115 }
9116 final_sigma = j == length || !_PyUnicode_IsCased(c);
9117 }
9118 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119}
9120
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009121static int
9122lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9123 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009125 /* Obscure special case. */
9126 if (c == 0x3A3) {
9127 mapped[0] = handle_capital_sigma(kind, data, length, i);
9128 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009130 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131}
9132
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009133static Py_ssize_t
9134do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009136 Py_ssize_t i, k = 0;
9137 int n_res, j;
9138 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009139
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009140 c = PyUnicode_READ(kind, data, 0);
9141 n_res = _PyUnicode_ToUpperFull(c, mapped);
9142 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009143 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009144 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 for (i = 1; i < length; i++) {
9147 c = PyUnicode_READ(kind, data, i);
9148 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9149 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009150 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009151 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009152 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009153 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009154 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155}
9156
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157static Py_ssize_t
9158do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9159 Py_ssize_t i, k = 0;
9160
9161 for (i = 0; i < length; i++) {
9162 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9163 int n_res, j;
9164 if (Py_UNICODE_ISUPPER(c)) {
9165 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9166 }
9167 else if (Py_UNICODE_ISLOWER(c)) {
9168 n_res = _PyUnicode_ToUpperFull(c, mapped);
9169 }
9170 else {
9171 n_res = 1;
9172 mapped[0] = c;
9173 }
9174 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009175 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009176 res[k++] = mapped[j];
9177 }
9178 }
9179 return k;
9180}
9181
9182static Py_ssize_t
9183do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9184 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009186 Py_ssize_t i, k = 0;
9187
9188 for (i = 0; i < length; i++) {
9189 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9190 int n_res, j;
9191 if (lower)
9192 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9193 else
9194 n_res = _PyUnicode_ToUpperFull(c, mapped);
9195 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009196 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009197 res[k++] = mapped[j];
9198 }
9199 }
9200 return k;
9201}
9202
9203static Py_ssize_t
9204do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9205{
9206 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9207}
9208
9209static Py_ssize_t
9210do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9211{
9212 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9213}
9214
Benjamin Petersone51757f2012-01-12 21:10:29 -05009215static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009216do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9217{
9218 Py_ssize_t i, k = 0;
9219
9220 for (i = 0; i < length; i++) {
9221 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9222 Py_UCS4 mapped[3];
9223 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9224 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009225 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009226 res[k++] = mapped[j];
9227 }
9228 }
9229 return k;
9230}
9231
9232static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009233do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9234{
9235 Py_ssize_t i, k = 0;
9236 int previous_is_cased;
9237
9238 previous_is_cased = 0;
9239 for (i = 0; i < length; i++) {
9240 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9241 Py_UCS4 mapped[3];
9242 int n_res, j;
9243
9244 if (previous_is_cased)
9245 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9246 else
9247 n_res = _PyUnicode_ToTitleFull(c, mapped);
9248
9249 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009250 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009251 res[k++] = mapped[j];
9252 }
9253
9254 previous_is_cased = _PyUnicode_IsCased(c);
9255 }
9256 return k;
9257}
9258
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009259static PyObject *
9260case_operation(PyObject *self,
9261 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9262{
9263 PyObject *res = NULL;
9264 Py_ssize_t length, newlength = 0;
9265 int kind, outkind;
9266 void *data, *outdata;
9267 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9268
Benjamin Petersoneea48462012-01-16 14:28:50 -05009269 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009270
9271 kind = PyUnicode_KIND(self);
9272 data = PyUnicode_DATA(self);
9273 length = PyUnicode_GET_LENGTH(self);
9274 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9275 if (tmp == NULL)
9276 return PyErr_NoMemory();
9277 newlength = perform(kind, data, length, tmp, &maxchar);
9278 res = PyUnicode_New(newlength, maxchar);
9279 if (res == NULL)
9280 goto leave;
9281 tmpend = tmp + newlength;
9282 outdata = PyUnicode_DATA(res);
9283 outkind = PyUnicode_KIND(res);
9284 switch (outkind) {
9285 case PyUnicode_1BYTE_KIND:
9286 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9287 break;
9288 case PyUnicode_2BYTE_KIND:
9289 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9290 break;
9291 case PyUnicode_4BYTE_KIND:
9292 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9293 break;
9294 default:
9295 assert(0);
9296 break;
9297 }
9298 leave:
9299 PyMem_FREE(tmp);
9300 return res;
9301}
9302
Tim Peters8ce9f162004-08-27 01:49:32 +00009303PyObject *
9304PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009307 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009309 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009310 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9311 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009312 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009314 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009316 int use_memcpy;
9317 unsigned char *res_data = NULL, *sep_data = NULL;
9318 PyObject *last_obj;
9319 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320
Tim Peters05eba1f2004-08-27 21:32:02 +00009321 fseq = PySequence_Fast(seq, "");
9322 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009323 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009324 }
9325
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009326 /* NOTE: the following code can't call back into Python code,
9327 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009328 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009329
Tim Peters05eba1f2004-08-27 21:32:02 +00009330 seqlen = PySequence_Fast_GET_SIZE(fseq);
9331 /* If empty sequence, return u"". */
9332 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009333 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009334 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009336
Tim Peters05eba1f2004-08-27 21:32:02 +00009337 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009338 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009339 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009340 if (seqlen == 1) {
9341 if (PyUnicode_CheckExact(items[0])) {
9342 res = items[0];
9343 Py_INCREF(res);
9344 Py_DECREF(fseq);
9345 return res;
9346 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009347 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009348 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009349 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009350 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009351 /* Set up sep and seplen */
9352 if (separator == NULL) {
9353 /* fall back to a blank space separator */
9354 sep = PyUnicode_FromOrdinal(' ');
9355 if (!sep)
9356 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009357 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009358 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009359 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009360 else {
9361 if (!PyUnicode_Check(separator)) {
9362 PyErr_Format(PyExc_TypeError,
9363 "separator: expected str instance,"
9364 " %.80s found",
9365 Py_TYPE(separator)->tp_name);
9366 goto onError;
9367 }
9368 if (PyUnicode_READY(separator))
9369 goto onError;
9370 sep = separator;
9371 seplen = PyUnicode_GET_LENGTH(separator);
9372 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9373 /* inc refcount to keep this code path symmetric with the
9374 above case of a blank separator */
9375 Py_INCREF(sep);
9376 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009377 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009378 }
9379
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009380 /* There are at least two things to join, or else we have a subclass
9381 * of str in the sequence.
9382 * Do a pre-pass to figure out the total amount of space we'll
9383 * need (sz), and see whether all argument are strings.
9384 */
9385 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009386#ifdef Py_DEBUG
9387 use_memcpy = 0;
9388#else
9389 use_memcpy = 1;
9390#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009391 for (i = 0; i < seqlen; i++) {
9392 const Py_ssize_t old_sz = sz;
9393 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 if (!PyUnicode_Check(item)) {
9395 PyErr_Format(PyExc_TypeError,
9396 "sequence item %zd: expected str instance,"
9397 " %.80s found",
9398 i, Py_TYPE(item)->tp_name);
9399 goto onError;
9400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (PyUnicode_READY(item) == -1)
9402 goto onError;
9403 sz += PyUnicode_GET_LENGTH(item);
9404 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009405 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009406 if (i != 0)
9407 sz += seplen;
9408 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9409 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009411 goto onError;
9412 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009413 if (use_memcpy && last_obj != NULL) {
9414 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9415 use_memcpy = 0;
9416 }
9417 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009418 }
Tim Petersced69f82003-09-16 20:30:58 +00009419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009421 if (res == NULL)
9422 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009423
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009424 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009425#ifdef Py_DEBUG
9426 use_memcpy = 0;
9427#else
9428 if (use_memcpy) {
9429 res_data = PyUnicode_1BYTE_DATA(res);
9430 kind = PyUnicode_KIND(res);
9431 if (seplen != 0)
9432 sep_data = PyUnicode_1BYTE_DATA(sep);
9433 }
9434#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009436 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009437 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009439 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009440 if (use_memcpy) {
9441 Py_MEMCPY(res_data,
9442 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009443 kind * seplen);
9444 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009445 }
9446 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009447 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009448 res_offset += seplen;
9449 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009451 itemlen = PyUnicode_GET_LENGTH(item);
9452 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009453 if (use_memcpy) {
9454 Py_MEMCPY(res_data,
9455 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009456 kind * itemlen);
9457 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009458 }
9459 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009460 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009461 res_offset += itemlen;
9462 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009463 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009464 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009465 if (use_memcpy)
9466 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009467 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009468 else
9469 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009470
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009473 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009479 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 return NULL;
9481}
9482
/* Store `value` into `length` consecutive character slots of the buffer
   `data`, beginning at character index `start`.  `kind` selects the slot
   width (1, 2 or 4 bytes); the wchar kind is not supported.

   This is a statement macro.  `length` and `value` are evaluated more than
   once, so the arguments must be free of side effects.  The caller is
   responsible for `value` fitting in a slot and for the range lying inside
   the buffer. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9506
Victor Stinnerd3f08822012-05-29 12:57:52 +02009507void
9508_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9509 Py_UCS4 fill_char)
9510{
9511 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9512 const void *data = PyUnicode_DATA(unicode);
9513 assert(PyUnicode_IS_READY(unicode));
9514 assert(unicode_modifiable(unicode));
9515 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9516 assert(start >= 0);
9517 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9518 FILL(kind, data, fill_char, start, length);
9519}
9520
Victor Stinner3fe55312012-01-04 00:33:50 +01009521Py_ssize_t
9522PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9523 Py_UCS4 fill_char)
9524{
9525 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009526
9527 if (!PyUnicode_Check(unicode)) {
9528 PyErr_BadInternalCall();
9529 return -1;
9530 }
9531 if (PyUnicode_READY(unicode) == -1)
9532 return -1;
9533 if (unicode_check_modifiable(unicode))
9534 return -1;
9535
Victor Stinnerd3f08822012-05-29 12:57:52 +02009536 if (start < 0) {
9537 PyErr_SetString(PyExc_IndexError, "string index out of range");
9538 return -1;
9539 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009540 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9541 PyErr_SetString(PyExc_ValueError,
9542 "fill character is bigger than "
9543 "the string maximum character");
9544 return -1;
9545 }
9546
9547 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9548 length = Py_MIN(maxlen, length);
9549 if (length <= 0)
9550 return 0;
9551
Victor Stinnerd3f08822012-05-29 12:57:52 +02009552 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009553 return length;
9554}
9555
Victor Stinner9310abb2011-10-05 00:59:23 +02009556static PyObject *
9557pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009558 Py_ssize_t left,
9559 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 PyObject *u;
9563 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009564 int kind;
9565 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566
9567 if (left < 0)
9568 left = 0;
9569 if (right < 0)
9570 right = 0;
9571
Victor Stinnerc4b49542011-12-11 22:44:26 +01009572 if (left == 0 && right == 0)
9573 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9576 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009577 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9578 return NULL;
9579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009581 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009583 if (!u)
9584 return NULL;
9585
9586 kind = PyUnicode_KIND(u);
9587 data = PyUnicode_DATA(u);
9588 if (left)
9589 FILL(kind, data, fill, 0, left);
9590 if (right)
9591 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009592 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009593 assert(_PyUnicode_CheckConsistency(u, 1));
9594 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595}
9596
Alexander Belopolsky40018472011-02-26 01:02:56 +00009597PyObject *
9598PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601
9602 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009603 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009605 if (PyUnicode_READY(string) == -1) {
9606 Py_DECREF(string);
9607 return NULL;
9608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
Benjamin Petersonead6b532011-12-20 17:23:42 -06009610 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009612 if (PyUnicode_IS_ASCII(string))
9613 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009614 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009615 PyUnicode_GET_LENGTH(string), keepends);
9616 else
9617 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009618 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 break;
9621 case PyUnicode_2BYTE_KIND:
9622 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009623 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 PyUnicode_GET_LENGTH(string), keepends);
9625 break;
9626 case PyUnicode_4BYTE_KIND:
9627 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009628 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 PyUnicode_GET_LENGTH(string), keepends);
9630 break;
9631 default:
9632 assert(0);
9633 list = 0;
9634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 Py_DECREF(string);
9636 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Alexander Belopolsky40018472011-02-26 01:02:56 +00009639static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009640split(PyObject *self,
9641 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009642 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 int kind1, kind2, kind;
9645 void *buf1, *buf2;
9646 Py_ssize_t len1, len2;
9647 PyObject* out;
9648
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009650 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 if (PyUnicode_READY(self) == -1)
9653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009656 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 if (PyUnicode_IS_ASCII(self))
9659 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009660 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009661 PyUnicode_GET_LENGTH(self), maxcount
9662 );
9663 else
9664 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009665 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 PyUnicode_GET_LENGTH(self), maxcount
9667 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 case PyUnicode_2BYTE_KIND:
9669 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009670 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 PyUnicode_GET_LENGTH(self), maxcount
9672 );
9673 case PyUnicode_4BYTE_KIND:
9674 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009675 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 PyUnicode_GET_LENGTH(self), maxcount
9677 );
9678 default:
9679 assert(0);
9680 return NULL;
9681 }
9682
9683 if (PyUnicode_READY(substring) == -1)
9684 return NULL;
9685
9686 kind1 = PyUnicode_KIND(self);
9687 kind2 = PyUnicode_KIND(substring);
9688 kind = kind1 > kind2 ? kind1 : kind2;
9689 buf1 = PyUnicode_DATA(self);
9690 buf2 = PyUnicode_DATA(substring);
9691 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009692 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 if (!buf1)
9694 return NULL;
9695 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009696 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (!buf2) {
9698 if (kind1 != kind) PyMem_Free(buf1);
9699 return NULL;
9700 }
9701 len1 = PyUnicode_GET_LENGTH(self);
9702 len2 = PyUnicode_GET_LENGTH(substring);
9703
Benjamin Petersonead6b532011-12-20 17:23:42 -06009704 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009706 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9707 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009708 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009709 else
9710 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009711 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 break;
9713 case PyUnicode_2BYTE_KIND:
9714 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 break;
9717 case PyUnicode_4BYTE_KIND:
9718 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009719 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 break;
9721 default:
9722 out = NULL;
9723 }
9724 if (kind1 != kind)
9725 PyMem_Free(buf1);
9726 if (kind2 != kind)
9727 PyMem_Free(buf2);
9728 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729}
9730
Alexander Belopolsky40018472011-02-26 01:02:56 +00009731static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009732rsplit(PyObject *self,
9733 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009734 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 int kind1, kind2, kind;
9737 void *buf1, *buf2;
9738 Py_ssize_t len1, len2;
9739 PyObject* out;
9740
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009741 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009742 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 if (PyUnicode_READY(self) == -1)
9745 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009748 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009750 if (PyUnicode_IS_ASCII(self))
9751 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009752 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009753 PyUnicode_GET_LENGTH(self), maxcount
9754 );
9755 else
9756 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009758 PyUnicode_GET_LENGTH(self), maxcount
9759 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 case PyUnicode_2BYTE_KIND:
9761 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009762 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 PyUnicode_GET_LENGTH(self), maxcount
9764 );
9765 case PyUnicode_4BYTE_KIND:
9766 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009767 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 PyUnicode_GET_LENGTH(self), maxcount
9769 );
9770 default:
9771 assert(0);
9772 return NULL;
9773 }
9774
9775 if (PyUnicode_READY(substring) == -1)
9776 return NULL;
9777
9778 kind1 = PyUnicode_KIND(self);
9779 kind2 = PyUnicode_KIND(substring);
9780 kind = kind1 > kind2 ? kind1 : kind2;
9781 buf1 = PyUnicode_DATA(self);
9782 buf2 = PyUnicode_DATA(substring);
9783 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009784 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 if (!buf1)
9786 return NULL;
9787 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (!buf2) {
9790 if (kind1 != kind) PyMem_Free(buf1);
9791 return NULL;
9792 }
9793 len1 = PyUnicode_GET_LENGTH(self);
9794 len2 = PyUnicode_GET_LENGTH(substring);
9795
Benjamin Petersonead6b532011-12-20 17:23:42 -06009796 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009798 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9799 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009800 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009801 else
9802 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 break;
9805 case PyUnicode_2BYTE_KIND:
9806 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 break;
9809 case PyUnicode_4BYTE_KIND:
9810 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 break;
9813 default:
9814 out = NULL;
9815 }
9816 if (kind1 != kind)
9817 PyMem_Free(buf1);
9818 if (kind2 != kind)
9819 PyMem_Free(buf2);
9820 return out;
9821}
9822
9823static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9825 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009827 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9830 return asciilib_find(buf1, len1, buf2, len2, offset);
9831 else
9832 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 case PyUnicode_2BYTE_KIND:
9834 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9835 case PyUnicode_4BYTE_KIND:
9836 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9837 }
9838 assert(0);
9839 return -1;
9840}
9841
9842static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009843anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9844 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009846 switch (kind) {
9847 case PyUnicode_1BYTE_KIND:
9848 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9849 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9850 else
9851 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9852 case PyUnicode_2BYTE_KIND:
9853 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9854 case PyUnicode_4BYTE_KIND:
9855 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9856 }
9857 assert(0);
9858 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009859}
9860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862replace(PyObject *self, PyObject *str1,
9863 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 PyObject *u;
9866 char *sbuf = PyUnicode_DATA(self);
9867 char *buf1 = PyUnicode_DATA(str1);
9868 char *buf2 = PyUnicode_DATA(str2);
9869 int srelease = 0, release1 = 0, release2 = 0;
9870 int skind = PyUnicode_KIND(self);
9871 int kind1 = PyUnicode_KIND(str1);
9872 int kind2 = PyUnicode_KIND(str2);
9873 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9874 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9875 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009876 int mayshrink;
9877 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878
9879 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009880 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009882 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Victor Stinner59de0ee2011-10-07 10:01:28 +02009884 if (str1 == str2)
9885 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 if (skind < kind1)
9887 /* substring too wide to be present */
9888 goto nothing;
9889
Victor Stinner49a0a212011-10-12 23:46:10 +02009890 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9891 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9892 /* Replacing str1 with str2 may cause a maxchar reduction in the
9893 result string. */
9894 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009895 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009898 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009900 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009902 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009903 Py_UCS4 u1, u2;
9904 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009905 Py_ssize_t index, pos;
9906 char *src;
9907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009909 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9910 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009911 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009914 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009916 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009918
9919 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9920 index = 0;
9921 src = sbuf;
9922 while (--maxcount)
9923 {
9924 pos++;
9925 src += pos * PyUnicode_KIND(self);
9926 slen -= pos;
9927 index += pos;
9928 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9929 if (pos < 0)
9930 break;
9931 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9932 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009933 }
9934 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 int rkind = skind;
9936 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009937 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (kind1 < rkind) {
9940 /* widen substring */
9941 buf1 = _PyUnicode_AsKind(str1, rkind);
9942 if (!buf1) goto error;
9943 release1 = 1;
9944 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009945 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009946 if (i < 0)
9947 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 if (rkind > kind2) {
9949 /* widen replacement */
9950 buf2 = _PyUnicode_AsKind(str2, rkind);
9951 if (!buf2) goto error;
9952 release2 = 1;
9953 }
9954 else if (rkind < kind2) {
9955 /* widen self and buf1 */
9956 rkind = kind2;
9957 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009958 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 sbuf = _PyUnicode_AsKind(self, rkind);
9960 if (!sbuf) goto error;
9961 srelease = 1;
9962 buf1 = _PyUnicode_AsKind(str1, rkind);
9963 if (!buf1) goto error;
9964 release1 = 1;
9965 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009966 u = PyUnicode_New(slen, maxchar);
9967 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009969 assert(PyUnicode_KIND(u) == rkind);
9970 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009971
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009972 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009973 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009974 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009978
9979 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009981 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009983 if (i == -1)
9984 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009987 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009991 }
9992 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009994 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 int rkind = skind;
9996 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009999 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 buf1 = _PyUnicode_AsKind(str1, rkind);
10001 if (!buf1) goto error;
10002 release1 = 1;
10003 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 if (n == 0)
10006 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010008 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 buf2 = _PyUnicode_AsKind(str2, rkind);
10010 if (!buf2) goto error;
10011 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010014 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 rkind = kind2;
10016 sbuf = _PyUnicode_AsKind(self, rkind);
10017 if (!sbuf) goto error;
10018 srelease = 1;
10019 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010020 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 buf1 = _PyUnicode_AsKind(str1, rkind);
10022 if (!buf1) goto error;
10023 release1 = 1;
10024 }
10025 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10026 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010027 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 PyErr_SetString(PyExc_OverflowError,
10029 "replace string is too long");
10030 goto error;
10031 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010032 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010033 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010034 _Py_INCREF_UNICODE_EMPTY();
10035 if (!unicode_empty)
10036 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010037 u = unicode_empty;
10038 goto done;
10039 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010040 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 PyErr_SetString(PyExc_OverflowError,
10042 "replace string is too long");
10043 goto error;
10044 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010045 u = PyUnicode_New(new_size, maxchar);
10046 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010048 assert(PyUnicode_KIND(u) == rkind);
10049 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 ires = i = 0;
10051 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010052 while (n-- > 0) {
10053 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 if (j == -1)
10058 break;
10059 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010060 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010061 memcpy(res + rkind * ires,
10062 sbuf + rkind * i,
10063 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010065 }
10066 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010068 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 memcpy(res + rkind * ires,
10078 sbuf + rkind * i,
10079 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010080 }
10081 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* interleave */
10083 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 if (--n <= 0)
10089 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 memcpy(res + rkind * ires,
10091 sbuf + rkind * i,
10092 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 ires++;
10094 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010095 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 memcpy(res + rkind * ires,
10097 sbuf + rkind * i,
10098 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010099 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 }
10101
10102 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010103 unicode_adjust_maxchar(&u);
10104 if (u == NULL)
10105 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010107
10108 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (srelease)
10110 PyMem_FREE(sbuf);
10111 if (release1)
10112 PyMem_FREE(buf1);
10113 if (release2)
10114 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010115 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (srelease)
10121 PyMem_FREE(sbuf);
10122 if (release1)
10123 PyMem_FREE(buf1);
10124 if (release2)
10125 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010126 return unicode_result_unchanged(self);
10127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 error:
10129 if (srelease && sbuf)
10130 PyMem_FREE(sbuf);
10131 if (release1 && buf1)
10132 PyMem_FREE(buf1);
10133 if (release2 && buf2)
10134 PyMem_FREE(buf2);
10135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
10138/* --- Unicode Object Methods --------------------------------------------- */
10139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010140PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142\n\
10143Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010144characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
10146static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010147unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010149 if (PyUnicode_READY(self) == -1)
10150 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010151 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152}
10153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010154PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156\n\
10157Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010158have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
10160static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010161unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 if (PyUnicode_READY(self) == -1)
10164 return NULL;
10165 if (PyUnicode_GET_LENGTH(self) == 0)
10166 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010167 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
Benjamin Petersond5890c82012-01-14 13:23:30 -050010170PyDoc_STRVAR(casefold__doc__,
10171 "S.casefold() -> str\n\
10172\n\
10173Return a version of S suitable for caseless comparisons.");
10174
10175static PyObject *
10176unicode_casefold(PyObject *self)
10177{
10178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 if (PyUnicode_IS_ASCII(self))
10181 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010182 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010183}
10184
10185
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010186/* Argument converter. Coerces to a single unicode character */
10187
10188static int
10189convert_uc(PyObject *obj, void *addr)
10190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010193
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 uniobj = PyUnicode_FromObject(obj);
10195 if (uniobj == NULL) {
10196 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 return 0;
10199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010203 Py_DECREF(uniobj);
10204 return 0;
10205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 Py_DECREF(uniobj);
10208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010209}
10210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010211PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010212 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010214Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010215done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
10217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010218unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010220 Py_ssize_t marg, left;
10221 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 Py_UCS4 fillchar = ' ';
10223
Victor Stinnere9a29352011-10-01 02:14:59 +020010224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Benjamin Petersonbac79492012-01-14 13:34:47 -050010227 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return NULL;
10229
Victor Stinnerc4b49542011-12-11 22:44:26 +010010230 if (PyUnicode_GET_LENGTH(self) >= width)
10231 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Victor Stinnerc4b49542011-12-11 22:44:26 +010010233 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 left = marg / 2 + (marg & width & 1);
10235
Victor Stinner9310abb2011-10-05 00:59:23 +020010236 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237}
10238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239/* This function assumes that str1 and str2 are readied by the caller. */
10240
Marc-André Lemburge5034372000-08-08 08:04:29 +000010241static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010242unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 int kind1, kind2;
10245 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010246 Py_ssize_t len1, len2;
10247 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010248
Victor Stinner90db9c42012-10-04 21:53:50 +020010249 /* a string is equal to itself */
10250 if (str1 == str2)
10251 return 0;
10252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 kind1 = PyUnicode_KIND(str1);
10254 kind2 = PyUnicode_KIND(str2);
10255 data1 = PyUnicode_DATA(str1);
10256 data2 = PyUnicode_DATA(str2);
10257 len1 = PyUnicode_GET_LENGTH(str1);
10258 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010259 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010260
Victor Stinner770e19e2012-10-04 22:59:45 +020010261 if (kind1 == 1 && kind2 == 1) {
10262 int cmp = memcmp(data1, data2, len);
10263 /* normalize result of memcmp() into the range [-1; 1] */
10264 if (cmp < 0)
10265 return -1;
10266 if (cmp > 0)
10267 return 1;
10268 }
10269 else {
10270 for (i = 0; i < len; ++i) {
10271 Py_UCS4 c1, c2;
10272 c1 = PyUnicode_READ(kind1, data1, i);
10273 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010274
Victor Stinner770e19e2012-10-04 22:59:45 +020010275 if (c1 != c2)
10276 return (c1 < c2) ? -1 : 1;
10277 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010278 }
10279
Victor Stinner770e19e2012-10-04 22:59:45 +020010280 if (len1 == len2)
10281 return 0;
10282 if (len1 < len2)
10283 return -1;
10284 else
10285 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010286}
10287
Victor Stinnere5567ad2012-10-23 02:48:49 +020010288static int
10289unicode_compare_eq(PyObject *str1, PyObject *str2)
10290{
10291 int kind;
10292 void *data1, *data2;
10293 Py_ssize_t len;
10294 int cmp;
10295
10296 /* a string is equal to itself */
10297 if (str1 == str2)
10298 return 1;
10299
10300 len = PyUnicode_GET_LENGTH(str1);
10301 if (PyUnicode_GET_LENGTH(str2) != len)
10302 return 0;
10303 kind = PyUnicode_KIND(str1);
10304 if (PyUnicode_KIND(str2) != kind)
10305 return 0;
10306 data1 = PyUnicode_DATA(str1);
10307 data2 = PyUnicode_DATA(str2);
10308
10309 cmp = memcmp(data1, data2, len * kind);
10310 return (cmp == 0);
10311}
10312
10313
Alexander Belopolsky40018472011-02-26 01:02:56 +000010314int
10315PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10318 if (PyUnicode_READY(left) == -1 ||
10319 PyUnicode_READY(right) == -1)
10320 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010321 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010323 PyErr_Format(PyExc_TypeError,
10324 "Can't compare %.100s and %.100s",
10325 left->ob_type->tp_name,
10326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327 return -1;
10328}
10329
Martin v. Löwis5b222132007-06-10 09:51:05 +000010330int
10331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 Py_ssize_t i;
10334 int kind;
10335 void *data;
10336 Py_UCS4 chr;
10337
Victor Stinner910337b2011-10-03 03:20:16 +020010338 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_READY(uni) == -1)
10340 return -1;
10341 kind = PyUnicode_KIND(uni);
10342 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010343 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10345 if (chr != str[i])
10346 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010347 /* This check keeps Python strings that end in '\0' from comparing equal
10348 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010350 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010351 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010353 return 0;
10354}
10355
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010356
Benjamin Peterson29060642009-01-31 22:14:21 +000010357#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010359
Alexander Belopolsky40018472011-02-26 01:02:56 +000010360PyObject *
10361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010362{
10363 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010364 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365
Victor Stinnere5567ad2012-10-23 02:48:49 +020010366 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10367 Py_RETURN_NOTIMPLEMENTED;
10368
10369 if (PyUnicode_READY(left) == -1 ||
10370 PyUnicode_READY(right) == -1)
10371 return NULL;
10372
10373 if (op == Py_EQ || op == Py_NE) {
10374 result = unicode_compare_eq(left, right);
10375 if (op == Py_EQ)
10376 v = TEST_COND(result);
10377 else
10378 v = TEST_COND(!result);
10379 }
10380 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010381 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010382
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010383 /* Convert the return value to a Boolean */
10384 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010385 case Py_LE:
10386 v = TEST_COND(result <= 0);
10387 break;
10388 case Py_GE:
10389 v = TEST_COND(result >= 0);
10390 break;
10391 case Py_LT:
10392 v = TEST_COND(result == -1);
10393 break;
10394 case Py_GT:
10395 v = TEST_COND(result == 1);
10396 break;
10397 default:
10398 PyErr_BadArgument();
10399 return NULL;
10400 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010401 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010402 Py_INCREF(v);
10403 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010404}
10405
Alexander Belopolsky40018472011-02-26 01:02:56 +000010406int
10407PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010408{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010409 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 int kind1, kind2, kind;
10411 void *buf1, *buf2;
10412 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010413 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010414
10415 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010416 sub = PyUnicode_FromObject(element);
10417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 PyErr_Format(PyExc_TypeError,
10419 "'in <string>' requires string as left operand, not %s",
10420 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010422 }
10423
Thomas Wouters477c8d52006-05-27 19:21:47 +000010424 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010425 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 Py_DECREF(sub);
10427 return -1;
10428 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010429 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10430 Py_DECREF(sub);
10431 Py_DECREF(str);
10432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 kind1 = PyUnicode_KIND(str);
10435 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010436 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 buf1 = PyUnicode_DATA(str);
10438 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010439 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010440 if (kind2 > kind) {
10441 Py_DECREF(sub);
10442 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010444 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010445 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (!buf2) {
10448 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010449 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 return -1;
10451 }
10452 len1 = PyUnicode_GET_LENGTH(str);
10453 len2 = PyUnicode_GET_LENGTH(sub);
10454
Benjamin Petersonead6b532011-12-20 17:23:42 -060010455 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 case PyUnicode_1BYTE_KIND:
10457 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10458 break;
10459 case PyUnicode_2BYTE_KIND:
10460 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 case PyUnicode_4BYTE_KIND:
10463 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10464 break;
10465 default:
10466 result = -1;
10467 assert(0);
10468 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469
10470 Py_DECREF(str);
10471 Py_DECREF(sub);
10472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (kind2 != kind)
10474 PyMem_Free(buf2);
10475
Guido van Rossum403d68b2000-03-13 15:55:09 +000010476 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010477}
10478
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479/* Concat to string or Unicode object giving a new Unicode object. */
10480
Alexander Belopolsky40018472011-02-26 01:02:56 +000010481PyObject *
10482PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010485 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010486 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
10488 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
10496 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010497 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010501 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504 }
10505
Victor Stinner488fa492011-12-12 00:01:39 +010010506 u_len = PyUnicode_GET_LENGTH(u);
10507 v_len = PyUnicode_GET_LENGTH(v);
10508 if (u_len > PY_SSIZE_T_MAX - v_len) {
10509 PyErr_SetString(PyExc_OverflowError,
10510 "strings are too large to concat");
10511 goto onError;
10512 }
10513 new_len = u_len + v_len;
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010516 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010517 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010520 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010523 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10524 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 Py_DECREF(u);
10526 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010527 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 Py_XDECREF(u);
10532 Py_XDECREF(v);
10533 return NULL;
10534}
10535
Walter Dörwald1ab83302007-05-18 17:15:44 +000010536void
Victor Stinner23e56682011-10-03 03:54:37 +020010537PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010538{
Victor Stinner23e56682011-10-03 03:54:37 +020010539 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010540 Py_UCS4 maxchar, maxchar2;
10541 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010542
10543 if (p_left == NULL) {
10544 if (!PyErr_Occurred())
10545 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 return;
10547 }
Victor Stinner23e56682011-10-03 03:54:37 +020010548 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010549 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010550 if (!PyErr_Occurred())
10551 PyErr_BadInternalCall();
10552 goto error;
10553 }
10554
Benjamin Petersonbac79492012-01-14 13:34:47 -050010555 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010556 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010557 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010558 goto error;
10559
Victor Stinner488fa492011-12-12 00:01:39 +010010560 /* Shortcuts */
10561 if (left == unicode_empty) {
10562 Py_DECREF(left);
10563 Py_INCREF(right);
10564 *p_left = right;
10565 return;
10566 }
10567 if (right == unicode_empty)
10568 return;
10569
10570 left_len = PyUnicode_GET_LENGTH(left);
10571 right_len = PyUnicode_GET_LENGTH(right);
10572 if (left_len > PY_SSIZE_T_MAX - right_len) {
10573 PyErr_SetString(PyExc_OverflowError,
10574 "strings are too large to concat");
10575 goto error;
10576 }
10577 new_len = left_len + right_len;
10578
10579 if (unicode_modifiable(left)
10580 && PyUnicode_CheckExact(right)
10581 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010582 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10583 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010584 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010585 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010586 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10587 {
10588 /* append inplace */
10589 if (unicode_resize(p_left, new_len) != 0) {
10590 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10591 * deallocated so it cannot be put back into
10592 * 'variable'. The MemoryError is raised when there
10593 * is no value in 'variable', which might (very
10594 * remotely) be a cause of incompatibilities.
10595 */
10596 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010597 }
Victor Stinner488fa492011-12-12 00:01:39 +010010598 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010599 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010600 }
Victor Stinner488fa492011-12-12 00:01:39 +010010601 else {
10602 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10603 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010604 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010605
Victor Stinner488fa492011-12-12 00:01:39 +010010606 /* Concat the two Unicode strings */
10607 res = PyUnicode_New(new_len, maxchar);
10608 if (res == NULL)
10609 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010610 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10611 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010612 Py_DECREF(left);
10613 *p_left = res;
10614 }
10615 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010616 return;
10617
10618error:
Victor Stinner488fa492011-12-12 00:01:39 +010010619 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010620}
10621
10622void
10623PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010625 PyUnicode_Append(pleft, right);
10626 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010627}
10628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010633string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010634interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
10636static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010637unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010639 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010640 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010641 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 int kind1, kind2, kind;
10644 void *buf1, *buf2;
10645 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Jesus Ceaac451502011-04-20 17:09:23 +020010647 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10648 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 kind1 = PyUnicode_KIND(self);
10652 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010653 if (kind2 > kind1)
10654 return PyLong_FromLong(0);
10655 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf1 = PyUnicode_DATA(self);
10657 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010659 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (!buf2) {
10661 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 return NULL;
10663 }
10664 len1 = PyUnicode_GET_LENGTH(self);
10665 len2 = PyUnicode_GET_LENGTH(substring);
10666
10667 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010668 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 case PyUnicode_1BYTE_KIND:
10670 iresult = ucs1lib_count(
10671 ((Py_UCS1*)buf1) + start, end - start,
10672 buf2, len2, PY_SSIZE_T_MAX
10673 );
10674 break;
10675 case PyUnicode_2BYTE_KIND:
10676 iresult = ucs2lib_count(
10677 ((Py_UCS2*)buf1) + start, end - start,
10678 buf2, len2, PY_SSIZE_T_MAX
10679 );
10680 break;
10681 case PyUnicode_4BYTE_KIND:
10682 iresult = ucs4lib_count(
10683 ((Py_UCS4*)buf1) + start, end - start,
10684 buf2, len2, PY_SSIZE_T_MAX
10685 );
10686 break;
10687 default:
10688 assert(0); iresult = 0;
10689 }
10690
10691 result = PyLong_FromSsize_t(iresult);
10692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (kind2 != kind)
10694 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 return result;
10699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010702 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010704Encode S using the codec registered for encoding. Default encoding\n\
10705is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10708'xmlcharrefreplace' as well as any other name registered with\n\
10709codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010712unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010714 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 char *encoding = NULL;
10716 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010717
Benjamin Peterson308d6372009-09-18 21:42:35 +000010718 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10719 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010721 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
10727Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010731unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010733 Py_ssize_t i, j, line_pos, src_len, incr;
10734 Py_UCS4 ch;
10735 PyObject *u;
10736 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010738 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010739 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
10741 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
Antoine Pitrou22425222011-10-04 19:10:51 +020010744 if (PyUnicode_READY(self) == -1)
10745 return NULL;
10746
Thomas Wouters7e474022000-07-16 12:04:32 +000010747 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010748 src_len = PyUnicode_GET_LENGTH(self);
10749 i = j = line_pos = 0;
10750 kind = PyUnicode_KIND(self);
10751 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010752 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010753 for (; i < src_len; i++) {
10754 ch = PyUnicode_READ(kind, src_data, i);
10755 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010756 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010758 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010760 goto overflow;
10761 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010763 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010767 goto overflow;
10768 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 if (ch == '\n' || ch == '\r')
10771 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010773 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010774 if (!found)
10775 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010776
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010778 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 if (!u)
10780 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010781 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
Antoine Pitroue71d5742011-10-04 15:55:09 +020010783 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Antoine Pitroue71d5742011-10-04 15:55:09 +020010785 for (; i < src_len; i++) {
10786 ch = PyUnicode_READ(kind, src_data, i);
10787 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010789 incr = tabsize - (line_pos % tabsize);
10790 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010791 FILL(kind, dest_data, ' ', j, incr);
10792 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010795 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010796 line_pos++;
10797 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010798 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 if (ch == '\n' || ch == '\r')
10800 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010802 }
10803 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010804 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010805
Antoine Pitroue71d5742011-10-04 15:55:09 +020010806 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010807 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
10814Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010815such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816arguments start and end are interpreted as in slice notation.\n\
10817\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010824 Py_ssize_t start;
10825 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Jesus Ceaac451502011-04-20 17:09:23 +020010828 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10829 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
10834 if (PyUnicode_READY(substring) == -1)
10835 return NULL;
10836
Victor Stinner7931d9a2011-11-04 00:22:48 +010010837 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
10839 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (result == -2)
10842 return NULL;
10843
Christian Heimes217cfd12007-12-02 14:31:20 +000010844 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845}
10846
10847static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010848unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010850 void *data;
10851 enum PyUnicode_Kind kind;
10852 Py_UCS4 ch;
10853 PyObject *res;
10854
10855 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10856 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010858 }
10859 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10860 PyErr_SetString(PyExc_IndexError, "string index out of range");
10861 return NULL;
10862 }
10863 kind = PyUnicode_KIND(self);
10864 data = PyUnicode_DATA(self);
10865 ch = PyUnicode_READ(kind, data, index);
10866 if (ch < 256)
10867 return get_latin1_char(ch);
10868
10869 res = PyUnicode_New(1, ch);
10870 if (res == NULL)
10871 return NULL;
10872 kind = PyUnicode_KIND(res);
10873 data = PyUnicode_DATA(res);
10874 PyUnicode_WRITE(kind, data, 0, ch);
10875 assert(_PyUnicode_CheckConsistency(res, 1));
10876 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877}
10878
Guido van Rossumc2504932007-09-18 19:42:40 +000010879/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010880 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010881static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010882unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Guido van Rossumc2504932007-09-18 19:42:40 +000010884 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080010885 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000010886
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010887#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010888 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (_PyUnicode_HASH(self) != -1)
10891 return _PyUnicode_HASH(self);
10892 if (PyUnicode_READY(self) == -1)
10893 return -1;
10894 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010895 /*
10896 We make the hash of the empty string be 0, rather than using
10897 (prefix ^ suffix), since this slightly obfuscates the hash secret
10898 */
10899 if (len == 0) {
10900 _PyUnicode_HASH(self) = 0;
10901 return 0;
10902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903
10904 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010905#define HASH(P) \
10906 x ^= (Py_uhash_t) *P << 7; \
10907 while (--len >= 0) \
10908 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909
Georg Brandl2fb477c2012-02-21 00:33:36 +010010910 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 switch (PyUnicode_KIND(self)) {
10912 case PyUnicode_1BYTE_KIND: {
10913 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10914 HASH(c);
10915 break;
10916 }
10917 case PyUnicode_2BYTE_KIND: {
10918 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10919 HASH(s);
10920 break;
10921 }
10922 default: {
10923 Py_UCS4 *l;
10924 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10925 "Impossible switch case in unicode_hash");
10926 l = PyUnicode_4BYTE_DATA(self);
10927 HASH(l);
10928 break;
10929 }
10930 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010931 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10932 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933
Guido van Rossumc2504932007-09-18 19:42:40 +000010934 if (x == -1)
10935 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010937 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010949 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010951 Py_ssize_t start;
10952 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Jesus Ceaac451502011-04-20 17:09:23 +020010954 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10955 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (PyUnicode_READY(self) == -1)
10959 return NULL;
10960 if (PyUnicode_READY(substring) == -1)
10961 return NULL;
10962
Victor Stinner7931d9a2011-11-04 00:22:48 +010010963 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (result == -2)
10968 return NULL;
10969
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 if (result < 0) {
10971 PyErr_SetString(PyExc_ValueError, "substring not found");
10972 return NULL;
10973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974
Christian Heimes217cfd12007-12-02 14:31:20 +000010975 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010981Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010985unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 Py_ssize_t i, length;
10988 int kind;
10989 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 int cased;
10991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (PyUnicode_READY(self) == -1)
10993 return NULL;
10994 length = PyUnicode_GET_LENGTH(self);
10995 kind = PyUnicode_KIND(self);
10996 data = PyUnicode_DATA(self);
10997
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (length == 1)
11000 return PyBool_FromLong(
11001 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011003 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011006
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 for (i = 0; i < length; i++) {
11009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011010
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11012 return PyBool_FromLong(0);
11013 else if (!cased && Py_UNICODE_ISLOWER(ch))
11014 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011016 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017}
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011022Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011023at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011026unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 Py_ssize_t i, length;
11029 int kind;
11030 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 int cased;
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (PyUnicode_READY(self) == -1)
11034 return NULL;
11035 length = PyUnicode_GET_LENGTH(self);
11036 kind = PyUnicode_KIND(self);
11037 data = PyUnicode_DATA(self);
11038
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 if (length == 1)
11041 return PyBool_FromLong(
11042 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011047
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 for (i = 0; i < length; i++) {
11050 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011051
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11053 return PyBool_FromLong(0);
11054 else if (!cased && Py_UNICODE_ISUPPER(ch))
11055 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011057 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058}
11059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011060PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011063Return True if S is a titlecased string and there is at least one\n\
11064character in S, i.e. upper- and titlecase characters may only\n\
11065follow uncased characters and lowercase characters only cased ones.\n\
11066Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011069unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 Py_ssize_t i, length;
11072 int kind;
11073 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 int cased, previous_is_cased;
11075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 length = PyUnicode_GET_LENGTH(self);
11079 kind = PyUnicode_KIND(self);
11080 data = PyUnicode_DATA(self);
11081
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (length == 1) {
11084 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11085 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11086 (Py_UNICODE_ISUPPER(ch) != 0));
11087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011089 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011092
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 cased = 0;
11094 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 for (i = 0; i < length; i++) {
11096 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011097
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11099 if (previous_is_cased)
11100 return PyBool_FromLong(0);
11101 previous_is_cased = 1;
11102 cased = 1;
11103 }
11104 else if (Py_UNICODE_ISLOWER(ch)) {
11105 if (!previous_is_cased)
11106 return PyBool_FromLong(0);
11107 previous_is_cased = 1;
11108 cased = 1;
11109 }
11110 else
11111 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011113 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114}
11115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011119Return True if all characters in S are whitespace\n\
11120and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
11122static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011123unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 Py_ssize_t i, length;
11126 int kind;
11127 void *data;
11128
11129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
11131 length = PyUnicode_GET_LENGTH(self);
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 1)
11137 return PyBool_FromLong(
11138 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 for (i = 0; i < length; i++) {
11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011146 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150}
11151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011155Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157
11158static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011159unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 Py_ssize_t i, length;
11162 int kind;
11163 void *data;
11164
11165 if (PyUnicode_READY(self) == -1)
11166 return NULL;
11167 length = PyUnicode_GET_LENGTH(self);
11168 kind = PyUnicode_KIND(self);
11169 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011170
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (length == 1)
11173 return PyBool_FromLong(
11174 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011175
11176 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 for (i = 0; i < length; i++) {
11181 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011184 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011185}
11186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011187PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011190Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192
11193static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011194unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 int kind;
11197 void *data;
11198 Py_ssize_t len, i;
11199
11200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011206
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (len == 1) {
11209 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11211 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011212
11213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 for (i = 0; i < len; i++) {
11218 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011219 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011221 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011222 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223}
11224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011225PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011228Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011232unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 Py_ssize_t i, length;
11235 int kind;
11236 void *data;
11237
11238 if (PyUnicode_READY(self) == -1)
11239 return NULL;
11240 length = PyUnicode_GET_LENGTH(self);
11241 kind = PyUnicode_KIND(self);
11242 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (length == 1)
11246 return PyBool_FromLong(
11247 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 for (i = 0; i < length; i++) {
11254 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011257 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011263Return True if all characters in S are digits\n\
11264and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011267unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 Py_ssize_t i, length;
11270 int kind;
11271 void *data;
11272
11273 if (PyUnicode_READY(self) == -1)
11274 return NULL;
11275 length = PyUnicode_GET_LENGTH(self);
11276 kind = PyUnicode_KIND(self);
11277 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 1) {
11281 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 for (i = 0; i < length; i++) {
11290 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011293 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294}
11295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011296PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011299Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
11302static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011303unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 Py_ssize_t i, length;
11306 int kind;
11307 void *data;
11308
11309 if (PyUnicode_READY(self) == -1)
11310 return NULL;
11311 length = PyUnicode_GET_LENGTH(self);
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 1)
11317 return PyBool_FromLong(
11318 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 for (i = 0; i < length; i++) {
11325 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011328 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Martin v. Löwis47383402007-08-15 07:32:56 +000011331int
11332PyUnicode_IsIdentifier(PyObject *self)
11333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 int kind;
11335 void *data;
11336 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011337 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (PyUnicode_READY(self) == -1) {
11340 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 }
11343
11344 /* Special case for empty strings */
11345 if (PyUnicode_GET_LENGTH(self) == 0)
11346 return 0;
11347 kind = PyUnicode_KIND(self);
11348 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011349
11350 /* PEP 3131 says that the first character must be in
11351 XID_Start and subsequent characters in XID_Continue,
11352 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011354 letters, digits, underscore). However, given the current
11355 definition of XID_Start and XID_Continue, it is sufficient
11356 to check just for these, except that _ must be allowed
11357 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011359 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011360 return 0;
11361
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011362 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011365 return 1;
11366}
11367
11368PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011370\n\
11371Return True if S is a valid identifier according\n\
11372to the language definition.");
11373
11374static PyObject*
11375unicode_isidentifier(PyObject *self)
11376{
11377 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11378}
11379
Georg Brandl559e5d72008-06-11 18:37:52 +000011380PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011382\n\
11383Return True if all characters in S are considered\n\
11384printable in repr() or S is empty, False otherwise.");
11385
11386static PyObject*
11387unicode_isprintable(PyObject *self)
11388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 Py_ssize_t i, length;
11390 int kind;
11391 void *data;
11392
11393 if (PyUnicode_READY(self) == -1)
11394 return NULL;
11395 length = PyUnicode_GET_LENGTH(self);
11396 kind = PyUnicode_KIND(self);
11397 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011398
11399 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 1)
11401 return PyBool_FromLong(
11402 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 for (i = 0; i < length; i++) {
11405 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011406 Py_RETURN_FALSE;
11407 }
11408 }
11409 Py_RETURN_TRUE;
11410}
11411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011413 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414\n\
11415Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011416iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
11418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011419unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011421 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422}
11423
Martin v. Löwis18e16552006-02-15 17:27:45 +000011424static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011425unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (PyUnicode_READY(self) == -1)
11428 return -1;
11429 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430}
11431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011435Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011436done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
11438static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011439unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_UCS4 fillchar = ' ';
11443
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011444 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 return NULL;
11446
Benjamin Petersonbac79492012-01-14 13:34:47 -050011447 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Victor Stinnerc4b49542011-12-11 22:44:26 +010011450 if (PyUnicode_GET_LENGTH(self) >= width)
11451 return unicode_result_unchanged(self);
11452
11453 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
11461static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011462unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011464 if (PyUnicode_READY(self) == -1)
11465 return NULL;
11466 if (PyUnicode_IS_ASCII(self))
11467 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011468 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469}
11470
/* Selector values for the strip family.  They double as indices into
   stripformat[] below, so the numeric values must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above.  The
   table is read-only, so both the characters and the pointers are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for selector `i`: skips the 3-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11479
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011480/* externally visible for str.strip(unicode) */
11481PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011482_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 void *data;
11485 int kind;
11486 Py_ssize_t i, j, len;
11487 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11490 return NULL;
11491
11492 kind = PyUnicode_KIND(self);
11493 data = PyUnicode_DATA(self);
11494 len = PyUnicode_GET_LENGTH(self);
11495 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11496 PyUnicode_DATA(sepobj),
11497 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011498
Benjamin Peterson14339b62009-01-31 16:36:08 +000011499 i = 0;
11500 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 while (i < len &&
11502 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 i++;
11504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011505 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011506
Benjamin Peterson14339b62009-01-31 16:36:08 +000011507 j = len;
11508 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 do {
11510 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 } while (j >= i &&
11512 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011514 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011515
Victor Stinner7931d9a2011-11-04 00:22:48 +010011516 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517}
11518
11519PyObject*
11520PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11521{
11522 unsigned char *data;
11523 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011524 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525
Victor Stinnerde636f32011-10-01 03:55:54 +020011526 if (PyUnicode_READY(self) == -1)
11527 return NULL;
11528
Victor Stinner684d5fd2012-05-03 02:32:34 +020011529 length = PyUnicode_GET_LENGTH(self);
11530 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011531
Victor Stinner684d5fd2012-05-03 02:32:34 +020011532 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011533 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534
Victor Stinnerde636f32011-10-01 03:55:54 +020011535 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011536 PyErr_SetString(PyExc_IndexError, "string index out of range");
11537 return NULL;
11538 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011539 if (start >= length || end < start)
11540 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011541
Victor Stinner684d5fd2012-05-03 02:32:34 +020011542 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011543 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011544 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011545 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011546 }
11547 else {
11548 kind = PyUnicode_KIND(self);
11549 data = PyUnicode_1BYTE_DATA(self);
11550 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011551 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011552 length);
11553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011557do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 int kind;
11560 void *data;
11561 Py_ssize_t len, i, j;
11562
11563 if (PyUnicode_READY(self) == -1)
11564 return NULL;
11565
11566 kind = PyUnicode_KIND(self);
11567 data = PyUnicode_DATA(self);
11568 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011569
Benjamin Peterson14339b62009-01-31 16:36:08 +000011570 i = 0;
11571 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011573 i++;
11574 }
11575 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011576
Benjamin Peterson14339b62009-01-31 16:36:08 +000011577 j = len;
11578 if (striptype != LEFTSTRIP) {
11579 do {
11580 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011582 j++;
11583 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011584
Victor Stinner7931d9a2011-11-04 00:22:48 +010011585 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586}
11587
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011588
11589static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011590do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011591{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011592 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011593
Benjamin Peterson14339b62009-01-31 16:36:08 +000011594 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11595 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011596
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 if (sep != NULL && sep != Py_None) {
11598 if (PyUnicode_Check(sep))
11599 return _PyUnicode_XStrip(self, striptype, sep);
11600 else {
11601 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 "%s arg must be None or str",
11603 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011604 return NULL;
11605 }
11606 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011607
Benjamin Peterson14339b62009-01-31 16:36:08 +000011608 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011609}
11610
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011614\n\
11615Return a copy of the string S with leading and trailing\n\
11616whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011617If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011618
11619static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011620unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011622 if (PyTuple_GET_SIZE(args) == 0)
11623 return do_strip(self, BOTHSTRIP); /* Common case */
11624 else
11625 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011626}
11627
11628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011629PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631\n\
11632Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011633If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634
11635static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011636unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011637{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011638 if (PyTuple_GET_SIZE(args) == 0)
11639 return do_strip(self, LEFTSTRIP); /* Common case */
11640 else
11641 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642}
11643
11644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011647\n\
11648Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011649If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650
11651static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011652unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 if (PyTuple_GET_SIZE(args) == 0)
11655 return do_strip(self, RIGHTSTRIP); /* Common case */
11656 else
11657 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011658}
11659
11660
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011664 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666
Serhiy Storchaka05997252013-01-26 12:14:02 +020011667 if (len < 1)
11668 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
Victor Stinnerc4b49542011-12-11 22:44:26 +010011670 /* no repeat, return original string */
11671 if (len == 1)
11672 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011673
Benjamin Petersonbac79492012-01-14 13:34:47 -050011674 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 return NULL;
11676
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011677 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011678 PyErr_SetString(PyExc_OverflowError,
11679 "repeated string is too long");
11680 return NULL;
11681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011683
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011684 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 if (!u)
11686 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011687 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 if (PyUnicode_GET_LENGTH(str) == 1) {
11690 const int kind = PyUnicode_KIND(str);
11691 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011692 if (kind == PyUnicode_1BYTE_KIND) {
11693 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011694 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011695 }
11696 else if (kind == PyUnicode_2BYTE_KIND) {
11697 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011698 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011699 ucs2[n] = fill_char;
11700 } else {
11701 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11702 assert(kind == PyUnicode_4BYTE_KIND);
11703 for (n = 0; n < len; ++n)
11704 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 }
11707 else {
11708 /* number of characters copied this far */
11709 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011710 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 char *to = (char *) PyUnicode_DATA(u);
11712 Py_MEMCPY(to, PyUnicode_DATA(str),
11713 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 n = (done <= nchars-done) ? done : nchars-done;
11716 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011717 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
11720
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011721 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011722 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
Alexander Belopolsky40018472011-02-26 01:02:56 +000011725PyObject *
11726PyUnicode_Replace(PyObject *obj,
11727 PyObject *subobj,
11728 PyObject *replobj,
11729 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730{
11731 PyObject *self;
11732 PyObject *str1;
11733 PyObject *str2;
11734 PyObject *result;
11735
11736 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011737 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011740 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 Py_DECREF(self);
11742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 }
11744 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011745 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 Py_DECREF(self);
11747 Py_DECREF(str1);
11748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011750 if (PyUnicode_READY(self) == -1 ||
11751 PyUnicode_READY(str1) == -1 ||
11752 PyUnicode_READY(str2) == -1)
11753 result = NULL;
11754 else
11755 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 Py_DECREF(self);
11757 Py_DECREF(str1);
11758 Py_DECREF(str2);
11759 return result;
11760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011763 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
11765Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011766old replaced by new. If the optional argument count is\n\
11767given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
11769static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 PyObject *str1;
11773 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 PyObject *result;
11776
Martin v. Löwis18e16552006-02-15 17:27:45 +000011777 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011779 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011782 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 return NULL;
11784 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011785 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 Py_DECREF(str1);
11787 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011788 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011789 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11790 result = NULL;
11791 else
11792 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
11794 Py_DECREF(str1);
11795 Py_DECREF(str2);
11796 return result;
11797}
11798
Alexander Belopolsky40018472011-02-26 01:02:56 +000011799static PyObject *
11800unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011802 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 Py_ssize_t isize;
11804 Py_ssize_t osize, squote, dquote, i, o;
11805 Py_UCS4 max, quote;
11806 int ikind, okind;
11807 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011810 return NULL;
11811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 isize = PyUnicode_GET_LENGTH(unicode);
11813 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 /* Compute length of output, quote characters, and
11816 maximum character */
11817 osize = 2; /* quotes */
11818 max = 127;
11819 squote = dquote = 0;
11820 ikind = PyUnicode_KIND(unicode);
11821 for (i = 0; i < isize; i++) {
11822 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11823 switch (ch) {
11824 case '\'': squote++; osize++; break;
11825 case '"': dquote++; osize++; break;
11826 case '\\': case '\t': case '\r': case '\n':
11827 osize += 2; break;
11828 default:
11829 /* Fast-path ASCII */
11830 if (ch < ' ' || ch == 0x7f)
11831 osize += 4; /* \xHH */
11832 else if (ch < 0x7f)
11833 osize++;
11834 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11835 osize++;
11836 max = ch > max ? ch : max;
11837 }
11838 else if (ch < 0x100)
11839 osize += 4; /* \xHH */
11840 else if (ch < 0x10000)
11841 osize += 6; /* \uHHHH */
11842 else
11843 osize += 10; /* \uHHHHHHHH */
11844 }
11845 }
11846
11847 quote = '\'';
11848 if (squote) {
11849 if (dquote)
11850 /* Both squote and dquote present. Use squote,
11851 and escape them */
11852 osize += squote;
11853 else
11854 quote = '"';
11855 }
11856
11857 repr = PyUnicode_New(osize, max);
11858 if (repr == NULL)
11859 return NULL;
11860 okind = PyUnicode_KIND(repr);
11861 odata = PyUnicode_DATA(repr);
11862
11863 PyUnicode_WRITE(okind, odata, 0, quote);
11864 PyUnicode_WRITE(okind, odata, osize-1, quote);
11865
11866 for (i = 0, o = 1; i < isize; i++) {
11867 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011868
11869 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 if ((ch == quote) || (ch == '\\')) {
11871 PyUnicode_WRITE(okind, odata, o++, '\\');
11872 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011873 continue;
11874 }
11875
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011877 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 PyUnicode_WRITE(okind, odata, o++, '\\');
11879 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011880 }
11881 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 PyUnicode_WRITE(okind, odata, o++, '\\');
11883 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011884 }
11885 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 PyUnicode_WRITE(okind, odata, o++, '\\');
11887 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011888 }
11889
11890 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011891 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 PyUnicode_WRITE(okind, odata, o++, '\\');
11893 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011894 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11895 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011896 }
11897
Georg Brandl559e5d72008-06-11 18:37:52 +000011898 /* Copy ASCII characters as-is */
11899 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011901 }
11902
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011904 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011905 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011906 (categories Z* and C* except ASCII space)
11907 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011909 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011910 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011913 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11914 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011915 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011916 /* Map 16-bit characters to '\uxxxx' */
11917 else if (ch <= 0xffff) {
11918 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011923 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011924 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011925 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011926 PyUnicode_WRITE(okind, odata, o++, 'U');
11927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011935 }
11936 }
11937 /* Copy characters as-is */
11938 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011940 }
11941 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011944 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011945 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946}
11947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950\n\
11951Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011952such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953arguments start and end are interpreted as in slice notation.\n\
11954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
11957static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011960 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011961 Py_ssize_t start;
11962 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011963 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
Jesus Ceaac451502011-04-20 17:09:23 +020011965 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11966 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (PyUnicode_READY(self) == -1)
11970 return NULL;
11971 if (PyUnicode_READY(substring) == -1)
11972 return NULL;
11973
Victor Stinner7931d9a2011-11-04 00:22:48 +010011974 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
11976 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (result == -2)
11979 return NULL;
11980
Christian Heimes217cfd12007-12-02 14:31:20 +000011981 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982}
11983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011984PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011987Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
11989static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011992 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011993 Py_ssize_t start;
11994 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011995 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996
Jesus Ceaac451502011-04-20 17:09:23 +020011997 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11998 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003 if (PyUnicode_READY(substring) == -1)
12004 return NULL;
12005
Victor Stinner7931d9a2011-11-04 00:22:48 +010012006 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (result == -2)
12011 return NULL;
12012
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 if (result < 0) {
12014 PyErr_SetString(PyExc_ValueError, "substring not found");
12015 return NULL;
12016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017
Christian Heimes217cfd12007-12-02 14:31:20 +000012018 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019}
12020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012024Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012025done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
12027static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012028unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012030 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 Py_UCS4 fillchar = ' ';
12032
Victor Stinnere9a29352011-10-01 02:14:59 +020012033 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012035
Benjamin Petersonbac79492012-01-14 13:34:47 -050012036 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037 return NULL;
12038
Victor Stinnerc4b49542011-12-11 22:44:26 +010012039 if (PyUnicode_GET_LENGTH(self) >= width)
12040 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
Victor Stinnerc4b49542011-12-11 22:44:26 +010012042 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
Alexander Belopolsky40018472011-02-26 01:02:56 +000012045PyObject *
12046PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047{
12048 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012049
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 s = PyUnicode_FromObject(s);
12051 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012052 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 if (sep != NULL) {
12054 sep = PyUnicode_FromObject(sep);
12055 if (sep == NULL) {
12056 Py_DECREF(s);
12057 return NULL;
12058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 }
12060
Victor Stinner9310abb2011-10-05 00:59:23 +020012061 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
12063 Py_DECREF(s);
12064 Py_XDECREF(sep);
12065 return result;
12066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012069 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
12071Return a list of the words in S, using sep as the\n\
12072delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012073splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012074whitespace string is a separator and empty strings are\n\
12075removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
12077static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012078unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012080 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012082 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012084 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12085 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 return NULL;
12087
12088 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012091 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012093 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
Thomas Wouters477c8d52006-05-27 19:21:47 +000012096PyObject *
12097PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12098{
12099 PyObject* str_obj;
12100 PyObject* sep_obj;
12101 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 int kind1, kind2, kind;
12103 void *buf1 = NULL, *buf2 = NULL;
12104 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012105
12106 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012107 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012109 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012110 if (!sep_obj) {
12111 Py_DECREF(str_obj);
12112 return NULL;
12113 }
12114 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12115 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012116 Py_DECREF(str_obj);
12117 return NULL;
12118 }
12119
Victor Stinner14f8f022011-10-05 20:58:25 +020012120 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012122 kind = Py_MAX(kind1, kind2);
12123 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012125 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (!buf1)
12127 goto onError;
12128 buf2 = PyUnicode_DATA(sep_obj);
12129 if (kind2 != kind)
12130 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12131 if (!buf2)
12132 goto onError;
12133 len1 = PyUnicode_GET_LENGTH(str_obj);
12134 len2 = PyUnicode_GET_LENGTH(sep_obj);
12135
Benjamin Petersonead6b532011-12-20 17:23:42 -060012136 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012138 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12139 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12140 else
12141 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 break;
12143 case PyUnicode_2BYTE_KIND:
12144 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12145 break;
12146 case PyUnicode_4BYTE_KIND:
12147 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12148 break;
12149 default:
12150 assert(0);
12151 out = 0;
12152 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012153
12154 Py_DECREF(sep_obj);
12155 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (kind1 != kind)
12157 PyMem_Free(buf1);
12158 if (kind2 != kind)
12159 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160
12161 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 onError:
12163 Py_DECREF(sep_obj);
12164 Py_DECREF(str_obj);
12165 if (kind1 != kind && buf1)
12166 PyMem_Free(buf1);
12167 if (kind2 != kind && buf2)
12168 PyMem_Free(buf2);
12169 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012170}
12171
12172
12173PyObject *
12174PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12175{
12176 PyObject* str_obj;
12177 PyObject* sep_obj;
12178 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 int kind1, kind2, kind;
12180 void *buf1 = NULL, *buf2 = NULL;
12181 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012182
12183 str_obj = PyUnicode_FromObject(str_in);
12184 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186 sep_obj = PyUnicode_FromObject(sep_in);
12187 if (!sep_obj) {
12188 Py_DECREF(str_obj);
12189 return NULL;
12190 }
12191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 kind1 = PyUnicode_KIND(str_in);
12193 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012194 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 buf1 = PyUnicode_DATA(str_in);
12196 if (kind1 != kind)
12197 buf1 = _PyUnicode_AsKind(str_in, kind);
12198 if (!buf1)
12199 goto onError;
12200 buf2 = PyUnicode_DATA(sep_obj);
12201 if (kind2 != kind)
12202 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12203 if (!buf2)
12204 goto onError;
12205 len1 = PyUnicode_GET_LENGTH(str_obj);
12206 len2 = PyUnicode_GET_LENGTH(sep_obj);
12207
Benjamin Petersonead6b532011-12-20 17:23:42 -060012208 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012210 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12211 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12212 else
12213 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 break;
12215 case PyUnicode_2BYTE_KIND:
12216 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12217 break;
12218 case PyUnicode_4BYTE_KIND:
12219 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12220 break;
12221 default:
12222 assert(0);
12223 out = 0;
12224 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012225
12226 Py_DECREF(sep_obj);
12227 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (kind1 != kind)
12229 PyMem_Free(buf1);
12230 if (kind2 != kind)
12231 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012232
12233 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 onError:
12235 Py_DECREF(sep_obj);
12236 Py_DECREF(str_obj);
12237 if (kind1 != kind && buf1)
12238 PyMem_Free(buf1);
12239 if (kind2 != kind && buf2)
12240 PyMem_Free(buf2);
12241 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012242}
12243
12244PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012247Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012248the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012249found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250
12251static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012252unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253{
Victor Stinner9310abb2011-10-05 00:59:23 +020012254 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255}
12256
12257PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012258 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012260Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012262separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012263
12264static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012265unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266{
Victor Stinner9310abb2011-10-05 00:59:23 +020012267 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268}
12269
Alexander Belopolsky40018472011-02-26 01:02:56 +000012270PyObject *
12271PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012272{
12273 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012274
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012275 s = PyUnicode_FromObject(s);
12276 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012277 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 if (sep != NULL) {
12279 sep = PyUnicode_FromObject(sep);
12280 if (sep == NULL) {
12281 Py_DECREF(s);
12282 return NULL;
12283 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012284 }
12285
Victor Stinner9310abb2011-10-05 00:59:23 +020012286 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012287
12288 Py_DECREF(s);
12289 Py_XDECREF(sep);
12290 return result;
12291}
12292
12293PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012294 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012295\n\
12296Return a list of the words in S, using sep as the\n\
12297delimiter string, starting at the end of the string and\n\
12298working to the front. If maxsplit is given, at most maxsplit\n\
12299splits are done. If sep is not specified, any whitespace string\n\
12300is a separator.");
12301
12302static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012303unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012304{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012305 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012306 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012307 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012308
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012309 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12310 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012311 return NULL;
12312
12313 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012316 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012317 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012318 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012319}
12320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012321PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323\n\
12324Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012325Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012326is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
12328static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012329unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012331 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012332 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012334 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12335 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336 return NULL;
12337
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012338 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339}
12340
12341static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012342PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012344 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345}
12346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012347PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349\n\
12350Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012351and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352
12353static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012354unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012356 if (PyUnicode_READY(self) == -1)
12357 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012358 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359}
12360
Georg Brandlceee0772007-11-27 23:48:05 +000012361PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012363\n\
12364Return a translation table usable for str.translate().\n\
12365If there is only one argument, it must be a dictionary mapping Unicode\n\
12366ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012367Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012368If there are two arguments, they must be strings of equal length, and\n\
12369in the resulting dictionary, each character in x will be mapped to the\n\
12370character at the same position in y. If there is a third argument, it\n\
12371must be a string, whose characters will be mapped to None in the result.");
12372
12373static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012374unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012375{
12376 PyObject *x, *y = NULL, *z = NULL;
12377 PyObject *new = NULL, *key, *value;
12378 Py_ssize_t i = 0;
12379 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380
Georg Brandlceee0772007-11-27 23:48:05 +000012381 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12382 return NULL;
12383 new = PyDict_New();
12384 if (!new)
12385 return NULL;
12386 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 int x_kind, y_kind, z_kind;
12388 void *x_data, *y_data, *z_data;
12389
Georg Brandlceee0772007-11-27 23:48:05 +000012390 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012391 if (!PyUnicode_Check(x)) {
12392 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12393 "be a string if there is a second argument");
12394 goto err;
12395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012397 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12398 "arguments must have equal length");
12399 goto err;
12400 }
12401 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 x_kind = PyUnicode_KIND(x);
12403 y_kind = PyUnicode_KIND(y);
12404 x_data = PyUnicode_DATA(x);
12405 y_data = PyUnicode_DATA(y);
12406 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12407 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012408 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012409 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012410 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012411 if (!value) {
12412 Py_DECREF(key);
12413 goto err;
12414 }
Georg Brandlceee0772007-11-27 23:48:05 +000012415 res = PyDict_SetItem(new, key, value);
12416 Py_DECREF(key);
12417 Py_DECREF(value);
12418 if (res < 0)
12419 goto err;
12420 }
12421 /* create entries for deleting chars in z */
12422 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 z_kind = PyUnicode_KIND(z);
12424 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012425 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012427 if (!key)
12428 goto err;
12429 res = PyDict_SetItem(new, key, Py_None);
12430 Py_DECREF(key);
12431 if (res < 0)
12432 goto err;
12433 }
12434 }
12435 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 int kind;
12437 void *data;
12438
Georg Brandlceee0772007-11-27 23:48:05 +000012439 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012440 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012441 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12442 "to maketrans it must be a dict");
12443 goto err;
12444 }
12445 /* copy entries into the new dict, converting string keys to int keys */
12446 while (PyDict_Next(x, &i, &key, &value)) {
12447 if (PyUnicode_Check(key)) {
12448 /* convert string keys to integer keys */
12449 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012450 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012451 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12452 "table must be of length 1");
12453 goto err;
12454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 kind = PyUnicode_KIND(key);
12456 data = PyUnicode_DATA(key);
12457 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012458 if (!newkey)
12459 goto err;
12460 res = PyDict_SetItem(new, newkey, value);
12461 Py_DECREF(newkey);
12462 if (res < 0)
12463 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012464 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012465 /* just keep integer keys */
12466 if (PyDict_SetItem(new, key, value) < 0)
12467 goto err;
12468 } else {
12469 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12470 "be strings or integers");
12471 goto err;
12472 }
12473 }
12474 }
12475 return new;
12476 err:
12477 Py_DECREF(new);
12478 return NULL;
12479}
12480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012481PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483\n\
12484Return a copy of the string S, where all characters have been mapped\n\
12485through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012486Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012487Unmapped characters are left untouched. Characters mapped to None\n\
12488are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
12490static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494}
12495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012496PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012502unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012504 if (PyUnicode_READY(self) == -1)
12505 return NULL;
12506 if (PyUnicode_IS_ASCII(self))
12507 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012508 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012514Pad a numeric string S with zeros on the left, to fill a field\n\
12515of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
12517static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012518unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012520 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012521 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012522 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 int kind;
12524 void *data;
12525 Py_UCS4 chr;
12526
Martin v. Löwis18e16552006-02-15 17:27:45 +000012527 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528 return NULL;
12529
Benjamin Petersonbac79492012-01-14 13:34:47 -050012530 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Victor Stinnerc4b49542011-12-11 22:44:26 +010012533 if (PyUnicode_GET_LENGTH(self) >= width)
12534 return unicode_result_unchanged(self);
12535
12536 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538 u = pad(self, fill, 0, '0');
12539
Walter Dörwald068325e2002-04-15 13:36:47 +000012540 if (u == NULL)
12541 return NULL;
12542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 kind = PyUnicode_KIND(u);
12544 data = PyUnicode_DATA(u);
12545 chr = PyUnicode_READ(kind, data, fill);
12546
12547 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 PyUnicode_WRITE(kind, data, 0, chr);
12550 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 }
12552
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012553 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012554 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012565PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012568Return True if S starts with the specified prefix, False otherwise.\n\
12569With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012570With optional end, stop comparing S at that position.\n\
12571prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
12573static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012574unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012577 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012578 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012579 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012580 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012581 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
Jesus Ceaac451502011-04-20 17:09:23 +020012583 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012585 if (PyTuple_Check(subobj)) {
12586 Py_ssize_t i;
12587 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012588 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012589 if (substring == NULL)
12590 return NULL;
12591 result = tailmatch(self, substring, start, end, -1);
12592 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012593 if (result == -1)
12594 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012595 if (result) {
12596 Py_RETURN_TRUE;
12597 }
12598 }
12599 /* nothing matched */
12600 Py_RETURN_FALSE;
12601 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012602 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012603 if (substring == NULL) {
12604 if (PyErr_ExceptionMatches(PyExc_TypeError))
12605 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12606 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012608 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012609 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012611 if (result == -1)
12612 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012613 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
12615
12616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012617PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012620Return True if S ends with the specified suffix, False otherwise.\n\
12621With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012622With optional end, stop comparing S at that position.\n\
12623suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
12625static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012626unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012629 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012630 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012631 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012632 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012633 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
Jesus Ceaac451502011-04-20 17:09:23 +020012635 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012637 if (PyTuple_Check(subobj)) {
12638 Py_ssize_t i;
12639 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012640 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012642 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012644 result = tailmatch(self, substring, start, end, +1);
12645 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012646 if (result == -1)
12647 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012648 if (result) {
12649 Py_RETURN_TRUE;
12650 }
12651 }
12652 Py_RETURN_FALSE;
12653 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012654 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012655 if (substring == NULL) {
12656 if (PyErr_ExceptionMatches(PyExc_TypeError))
12657 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12658 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012660 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012662 if (result == -1)
12663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012665 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666}
12667
Victor Stinner202fdca2012-05-07 12:47:02 +020012668Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012669_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012670{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012671 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012672 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12673 writer->data = PyUnicode_DATA(writer->buffer);
12674 writer->kind = PyUnicode_KIND(writer->buffer);
12675}
12676
Victor Stinnerd3f08822012-05-29 12:57:52 +020012677void
12678_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012679{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012680 memset(writer, 0, sizeof(*writer));
12681#ifdef Py_DEBUG
12682 writer->kind = 5; /* invalid kind */
12683#endif
12684 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012685 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012686}
12687
Victor Stinnerd3f08822012-05-29 12:57:52 +020012688int
12689_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12690 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012691{
12692 Py_ssize_t newlen;
12693 PyObject *newbuffer;
12694
Victor Stinnerd3f08822012-05-29 12:57:52 +020012695 assert(length > 0);
12696
Victor Stinner202fdca2012-05-07 12:47:02 +020012697 if (length > PY_SSIZE_T_MAX - writer->pos) {
12698 PyErr_NoMemory();
12699 return -1;
12700 }
12701 newlen = writer->pos + length;
12702
Victor Stinnerd3f08822012-05-29 12:57:52 +020012703 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012704 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012705 /* overallocate 25% to limit the number of resize */
12706 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12707 newlen += newlen / 4;
12708 if (newlen < writer->min_length)
12709 newlen = writer->min_length;
12710 }
12711 writer->buffer = PyUnicode_New(newlen, maxchar);
12712 if (writer->buffer == NULL)
12713 return -1;
12714 _PyUnicodeWriter_Update(writer);
12715 return 0;
12716 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012717
Victor Stinnerd3f08822012-05-29 12:57:52 +020012718 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012719 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012720 /* overallocate 25% to limit the number of resize */
12721 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12722 newlen += newlen / 4;
12723 if (newlen < writer->min_length)
12724 newlen = writer->min_length;
12725 }
12726
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012727 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012728 /* resize + widen */
12729 newbuffer = PyUnicode_New(newlen, maxchar);
12730 if (newbuffer == NULL)
12731 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012732 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12733 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012734 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012735 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012736 }
12737 else {
12738 newbuffer = resize_compact(writer->buffer, newlen);
12739 if (newbuffer == NULL)
12740 return -1;
12741 }
12742 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012743 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012744 }
12745 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012746 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012747 newbuffer = PyUnicode_New(writer->size, maxchar);
12748 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012749 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012750 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12751 writer->buffer, 0, writer->pos);
12752 Py_DECREF(writer->buffer);
12753 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012754 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012755 }
12756 return 0;
12757}
12758
Victor Stinnerd3f08822012-05-29 12:57:52 +020012759int
12760_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12761{
12762 Py_UCS4 maxchar;
12763 Py_ssize_t len;
12764
12765 if (PyUnicode_READY(str) == -1)
12766 return -1;
12767 len = PyUnicode_GET_LENGTH(str);
12768 if (len == 0)
12769 return 0;
12770 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12771 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012772 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012773 Py_INCREF(str);
12774 writer->buffer = str;
12775 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012776 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012777 writer->size = 0;
12778 writer->pos += len;
12779 return 0;
12780 }
12781 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12782 return -1;
12783 }
12784 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12785 str, 0, len);
12786 writer->pos += len;
12787 return 0;
12788}
12789
Victor Stinnere215d962012-10-06 23:03:36 +020012790int
12791_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12792{
12793 Py_UCS4 maxchar;
12794
12795 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12796 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12797 return -1;
12798 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12799 writer->pos += len;
12800 return 0;
12801}
12802
Victor Stinnerd3f08822012-05-29 12:57:52 +020012803PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012804_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012805{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012806 if (writer->pos == 0) {
12807 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012808 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012809 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012810 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012811 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12812 return writer->buffer;
12813 }
12814 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12815 PyObject *newbuffer;
12816 newbuffer = resize_compact(writer->buffer, writer->pos);
12817 if (newbuffer == NULL) {
12818 Py_DECREF(writer->buffer);
12819 return NULL;
12820 }
12821 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012822 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012823 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012824 return writer->buffer;
12825}
12826
Victor Stinnerd3f08822012-05-29 12:57:52 +020012827void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012828_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012829{
12830 Py_CLEAR(writer->buffer);
12831}
12832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012834
12835PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012837\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012838Return a formatted version of S, using substitutions from args and kwargs.\n\
12839The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012840
Eric Smith27bbca62010-11-04 17:06:58 +000012841PyDoc_STRVAR(format_map__doc__,
12842 "S.format_map(mapping) -> str\n\
12843\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012844Return a formatted version of S, using substitutions from mapping.\n\
12845The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012846
Eric Smith4a7d76d2008-05-30 18:10:19 +000012847static PyObject *
12848unicode__format__(PyObject* self, PyObject* args)
12849{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012850 PyObject *format_spec;
12851 _PyUnicodeWriter writer;
12852 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012853
12854 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12855 return NULL;
12856
Victor Stinnerd3f08822012-05-29 12:57:52 +020012857 if (PyUnicode_READY(self) == -1)
12858 return NULL;
12859 _PyUnicodeWriter_Init(&writer, 0);
12860 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12861 self, format_spec, 0,
12862 PyUnicode_GET_LENGTH(format_spec));
12863 if (ret == -1) {
12864 _PyUnicodeWriter_Dealloc(&writer);
12865 return NULL;
12866 }
12867 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012868}
12869
Eric Smith8c663262007-08-25 02:26:07 +000012870PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012872\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012873Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012874
12875static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 Py_ssize_t size;
12879
12880 /* If it's a compact object, account for base structure +
12881 character data. */
12882 if (PyUnicode_IS_COMPACT_ASCII(v))
12883 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12884 else if (PyUnicode_IS_COMPACT(v))
12885 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012886 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 else {
12888 /* If it is a two-block object, account for base object, and
12889 for character block if present. */
12890 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012891 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012893 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 }
12895 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012896 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012897 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012899 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012900 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901
12902 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012903}
12904
12905PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012907
12908static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012909unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012910{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012911 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 if (!copy)
12913 return NULL;
12914 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012915}
12916
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012918 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012920 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12921 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012922 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12923 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012924 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12926 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12927 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12928 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12929 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012931 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12932 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12933 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012934 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012935 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12936 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12937 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012938 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012940 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012941 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012942 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12943 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12944 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12945 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12946 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12947 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12948 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12949 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12950 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12951 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12952 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12953 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12954 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12955 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012956 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012957 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012958 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012959 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012960 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012961 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012962 {"maketrans", (PyCFunction) unicode_maketrans,
12963 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012964 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012965#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012966 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012967 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968#endif
12969
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971 {NULL, NULL}
12972};
12973
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012974static PyObject *
12975unicode_mod(PyObject *v, PyObject *w)
12976{
Brian Curtindfc80e32011-08-10 20:28:54 -050012977 if (!PyUnicode_Check(v))
12978 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012979 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012980}
12981
12982static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 0, /*nb_add*/
12984 0, /*nb_subtract*/
12985 0, /*nb_multiply*/
12986 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012987};
12988
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012990 (lenfunc) unicode_length, /* sq_length */
12991 PyUnicode_Concat, /* sq_concat */
12992 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12993 (ssizeargfunc) unicode_getitem, /* sq_item */
12994 0, /* sq_slice */
12995 0, /* sq_ass_item */
12996 0, /* sq_ass_slice */
12997 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998};
12999
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013000static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013001unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 if (PyUnicode_READY(self) == -1)
13004 return NULL;
13005
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013006 if (PyIndex_Check(item)) {
13007 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013008 if (i == -1 && PyErr_Occurred())
13009 return NULL;
13010 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013012 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013014 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013015 PyObject *result;
13016 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013017 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013018 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013021 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013022 return NULL;
13023 }
13024
13025 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013026 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013028 slicelength == PyUnicode_GET_LENGTH(self)) {
13029 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013030 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013031 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013032 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013035 src_kind = PyUnicode_KIND(self);
13036 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013037 if (!PyUnicode_IS_ASCII(self)) {
13038 kind_limit = kind_maxchar_limit(src_kind);
13039 max_char = 0;
13040 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13041 ch = PyUnicode_READ(src_kind, src_data, cur);
13042 if (ch > max_char) {
13043 max_char = ch;
13044 if (max_char >= kind_limit)
13045 break;
13046 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013047 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013048 }
Victor Stinner55c99112011-10-13 01:17:06 +020013049 else
13050 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013051 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013052 if (result == NULL)
13053 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013054 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013055 dest_data = PyUnicode_DATA(result);
13056
13057 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13059 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013060 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013061 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013063 } else {
13064 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13065 return NULL;
13066 }
13067}
13068
13069static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 (lenfunc)unicode_length, /* mp_length */
13071 (binaryfunc)unicode_subscript, /* mp_subscript */
13072 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073};
13074
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076/* Helpers for PyUnicode_Format() */
13077
Victor Stinnera47082312012-10-04 02:19:54 +020013078struct unicode_formatter_t {
13079 PyObject *args;
13080 int args_owned;
13081 Py_ssize_t arglen, argidx;
13082 PyObject *dict;
13083
13084 enum PyUnicode_Kind fmtkind;
13085 Py_ssize_t fmtcnt, fmtpos;
13086 void *fmtdata;
13087 PyObject *fmtstr;
13088
13089 _PyUnicodeWriter writer;
13090};
13091
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    /* Most recently read format character; after parsing, the conversion
       character (for example 's', 'd' or 'f'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width; the fast paths treat -1 as "not specified". */
    Py_ssize_t width;
    /* Precision; the fast paths treat -1 as "not specified". */
    int prec;
    /* Set to 1 by the integer and float conversions.
       NOTE(review): presumably consumed when the output is padded --
       confirm against unicode_format_arg_output(). */
    int sign;
};
13099
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013101unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Victor Stinnera47082312012-10-04 02:19:54 +020013103 Py_ssize_t argidx = ctx->argidx;
13104
13105 if (argidx < ctx->arglen) {
13106 ctx->argidx++;
13107 if (ctx->arglen < 0)
13108 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 else
Victor Stinnera47082312012-10-04 02:19:54 +020013110 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 }
13112 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 return NULL;
13115}
13116
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
Victor Stinnera47082312012-10-04 02:19:54 +020013119/* Format a float into the writer if the writer is not NULL, or into *p_output
13120 otherwise.
13121
13122 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013123static int
Victor Stinnera47082312012-10-04 02:19:54 +020013124formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13125 PyObject **p_output,
13126 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013128 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013130 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013131 int prec;
13132 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013133
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134 x = PyFloat_AsDouble(v);
13135 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013136 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013137
Victor Stinnera47082312012-10-04 02:19:54 +020013138 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013141
Victor Stinnera47082312012-10-04 02:19:54 +020013142 if (arg->flags & F_ALT)
13143 dtoa_flags = Py_DTSF_ALT;
13144 else
13145 dtoa_flags = 0;
13146 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013147 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013148 return -1;
13149 len = strlen(p);
13150 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013151 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13152 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013153 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013154 }
Victor Stinner184252a2012-06-16 02:57:41 +020013155 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013156 writer->pos += len;
13157 }
13158 else
13159 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013160 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013161 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162}
13163
Victor Stinnerd0880d52012-04-27 23:40:13 +020013164/* formatlong() emulates the format codes d, u, o, x and X, and
13165 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13166 * Python's regular ints.
13167 * Return value: a new PyUnicodeObject*, or NULL if error.
13168 * The output string is of the form
13169 * "-"? ("0x" | "0X")? digit+
13170 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13171 * set in flags. The case of hex digits will be correct,
13172 * There will be at least prec digits, zero-filled on the left if
13173 * necessary to get that many.
13174 * val object to be converted
13175 * flags bitmask of format flags; only F_ALT is looked at
13176 * prec minimum number of digits; 0-fill on left if needed
13177 * type a character in [duoxX]; u acts the same as d
13178 *
13179 * CAUTION: o, x and X conversions on regular ints can never
13180 * produce a '-' sign, but can for Python's unbounded ints.
13181 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013182static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013183formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013184{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013185 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013187 Py_ssize_t i;
13188 int sign; /* 1 if '-', else 0 */
13189 int len; /* number of characters */
13190 Py_ssize_t llen;
13191 int numdigits; /* len == numnondigits + numdigits */
13192 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013193 int prec = arg->prec;
13194 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013195
Victor Stinnerd0880d52012-04-27 23:40:13 +020013196 /* Avoid exceeding SSIZE_T_MAX */
13197 if (prec > INT_MAX-3) {
13198 PyErr_SetString(PyExc_OverflowError,
13199 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013201 }
13202
13203 assert(PyLong_Check(val));
13204
13205 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013206 default:
13207 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013208 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013209 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013210 case 'u':
13211 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013212 if (PyBool_Check(val))
13213 result = PyNumber_ToBase(val, 10);
13214 else
13215 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013216 break;
13217 case 'o':
13218 numnondigits = 2;
13219 result = PyNumber_ToBase(val, 8);
13220 break;
13221 case 'x':
13222 case 'X':
13223 numnondigits = 2;
13224 result = PyNumber_ToBase(val, 16);
13225 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013226 }
13227 if (!result)
13228 return NULL;
13229
13230 assert(unicode_modifiable(result));
13231 assert(PyUnicode_IS_READY(result));
13232 assert(PyUnicode_IS_ASCII(result));
13233
13234 /* To modify the string in-place, there can only be one reference. */
13235 if (Py_REFCNT(result) != 1) {
13236 PyErr_BadInternalCall();
13237 return NULL;
13238 }
13239 buf = PyUnicode_DATA(result);
13240 llen = PyUnicode_GET_LENGTH(result);
13241 if (llen > INT_MAX) {
13242 PyErr_SetString(PyExc_ValueError,
13243 "string too large in _PyBytes_FormatLong");
13244 return NULL;
13245 }
13246 len = (int)llen;
13247 sign = buf[0] == '-';
13248 numnondigits += sign;
13249 numdigits = len - numnondigits;
13250 assert(numdigits > 0);
13251
13252 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013253 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 (type == 'o' || type == 'x' || type == 'X'))) {
13255 assert(buf[sign] == '0');
13256 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13257 buf[sign+1] == 'o');
13258 numnondigits -= 2;
13259 buf += 2;
13260 len -= 2;
13261 if (sign)
13262 buf[0] = '-';
13263 assert(len == numnondigits + numdigits);
13264 assert(numdigits > 0);
13265 }
13266
13267 /* Fill with leading zeroes to meet minimum width. */
13268 if (prec > numdigits) {
13269 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13270 numnondigits + prec);
13271 char *b1;
13272 if (!r1) {
13273 Py_DECREF(result);
13274 return NULL;
13275 }
13276 b1 = PyBytes_AS_STRING(r1);
13277 for (i = 0; i < numnondigits; ++i)
13278 *b1++ = *buf++;
13279 for (i = 0; i < prec - numdigits; i++)
13280 *b1++ = '0';
13281 for (i = 0; i < numdigits; i++)
13282 *b1++ = *buf++;
13283 *b1 = '\0';
13284 Py_DECREF(result);
13285 result = r1;
13286 buf = PyBytes_AS_STRING(result);
13287 len = numnondigits + prec;
13288 }
13289
13290 /* Fix up case for hex conversions. */
13291 if (type == 'X') {
13292 /* Need to convert all lower case letters to upper case.
13293 and need to convert 0x to 0X (and -0x to -0X). */
13294 for (i = 0; i < len; i++)
13295 if (buf[i] >= 'a' && buf[i] <= 'x')
13296 buf[i] -= 'a'-'A';
13297 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013298 if (!PyUnicode_Check(result)
13299 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013300 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013302 Py_DECREF(result);
13303 result = unicode;
13304 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013305 else if (len != PyUnicode_GET_LENGTH(result)) {
13306 if (PyUnicode_Resize(&result, len) < 0)
13307 Py_CLEAR(result);
13308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013310}
13311
Victor Stinner621ef3d2012-10-02 00:33:47 +020013312/* Format an integer.
13313 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013314 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013315 * -1 and raise an exception on error */
13316static int
Victor Stinnera47082312012-10-04 02:19:54 +020013317mainformatlong(PyObject *v,
13318 struct unicode_format_arg_t *arg,
13319 PyObject **p_output,
13320 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013321{
13322 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013323 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013324
13325 if (!PyNumber_Check(v))
13326 goto wrongtype;
13327
13328 if (!PyLong_Check(v)) {
13329 iobj = PyNumber_Long(v);
13330 if (iobj == NULL) {
13331 if (PyErr_ExceptionMatches(PyExc_TypeError))
13332 goto wrongtype;
13333 return -1;
13334 }
13335 assert(PyLong_Check(iobj));
13336 }
13337 else {
13338 iobj = v;
13339 Py_INCREF(iobj);
13340 }
13341
13342 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013343 && arg->width == -1 && arg->prec == -1
13344 && !(arg->flags & (F_SIGN | F_BLANK))
13345 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013346 {
13347 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013348 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013349 int base;
13350
Victor Stinnera47082312012-10-04 02:19:54 +020013351 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013352 {
13353 default:
13354 assert(0 && "'type' not in [diuoxX]");
13355 case 'd':
13356 case 'i':
13357 case 'u':
13358 base = 10;
13359 break;
13360 case 'o':
13361 base = 8;
13362 break;
13363 case 'x':
13364 case 'X':
13365 base = 16;
13366 break;
13367 }
13368
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013369 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13370 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013371 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013372 }
13373 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013374 return 1;
13375 }
13376
Victor Stinnera47082312012-10-04 02:19:54 +020013377 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013378 Py_DECREF(iobj);
13379 if (res == NULL)
13380 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013381 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013382 return 0;
13383
13384wrongtype:
13385 PyErr_Format(PyExc_TypeError,
13386 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013387 "not %.200s",
13388 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013389 return -1;
13390}
13391
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392static Py_UCS4
13393formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013395 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013396 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 goto onError;
13401 }
13402 else {
13403 /* Integer input truncated to a character */
13404 long x;
13405 x = PyLong_AsLong(v);
13406 if (x == -1 && PyErr_Occurred())
13407 goto onError;
13408
Victor Stinner8faf8212011-12-08 22:14:11 +010013409 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 PyErr_SetString(PyExc_OverflowError,
13411 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013412 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 }
13414
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013416 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013417
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013419 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013421 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
Victor Stinnera47082312012-10-04 02:19:54 +020013424/* Parse options of an argument: flags, width, precision.
13425 Handle also "%(name)" syntax.
13426
13427 Return 0 if the argument has been formatted into arg->str.
13428 Return 1 if the argument has been written into ctx->writer,
13429 Raise an exception and return -1 on error. */
13430static int
13431unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13432 struct unicode_format_arg_t *arg)
13433{
13434#define FORMAT_READ(ctx) \
13435 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13436
13437 PyObject *v;
13438
Victor Stinnera47082312012-10-04 02:19:54 +020013439 if (arg->ch == '(') {
13440 /* Get argument value from a dictionary. Example: "%(name)s". */
13441 Py_ssize_t keystart;
13442 Py_ssize_t keylen;
13443 PyObject *key;
13444 int pcount = 1;
13445
13446 if (ctx->dict == NULL) {
13447 PyErr_SetString(PyExc_TypeError,
13448 "format requires a mapping");
13449 return -1;
13450 }
13451 ++ctx->fmtpos;
13452 --ctx->fmtcnt;
13453 keystart = ctx->fmtpos;
13454 /* Skip over balanced parentheses */
13455 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13456 arg->ch = FORMAT_READ(ctx);
13457 if (arg->ch == ')')
13458 --pcount;
13459 else if (arg->ch == '(')
13460 ++pcount;
13461 ctx->fmtpos++;
13462 }
13463 keylen = ctx->fmtpos - keystart - 1;
13464 if (ctx->fmtcnt < 0 || pcount > 0) {
13465 PyErr_SetString(PyExc_ValueError,
13466 "incomplete format key");
13467 return -1;
13468 }
13469 key = PyUnicode_Substring(ctx->fmtstr,
13470 keystart, keystart + keylen);
13471 if (key == NULL)
13472 return -1;
13473 if (ctx->args_owned) {
13474 Py_DECREF(ctx->args);
13475 ctx->args_owned = 0;
13476 }
13477 ctx->args = PyObject_GetItem(ctx->dict, key);
13478 Py_DECREF(key);
13479 if (ctx->args == NULL)
13480 return -1;
13481 ctx->args_owned = 1;
13482 ctx->arglen = -1;
13483 ctx->argidx = -2;
13484 }
13485
13486 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013487 while (--ctx->fmtcnt >= 0) {
13488 arg->ch = FORMAT_READ(ctx);
13489 ctx->fmtpos++;
13490 switch (arg->ch) {
13491 case '-': arg->flags |= F_LJUST; continue;
13492 case '+': arg->flags |= F_SIGN; continue;
13493 case ' ': arg->flags |= F_BLANK; continue;
13494 case '#': arg->flags |= F_ALT; continue;
13495 case '0': arg->flags |= F_ZERO; continue;
13496 }
13497 break;
13498 }
13499
13500 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013501 if (arg->ch == '*') {
13502 v = unicode_format_getnextarg(ctx);
13503 if (v == NULL)
13504 return -1;
13505 if (!PyLong_Check(v)) {
13506 PyErr_SetString(PyExc_TypeError,
13507 "* wants int");
13508 return -1;
13509 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013510 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013511 if (arg->width == -1 && PyErr_Occurred())
13512 return -1;
13513 if (arg->width < 0) {
13514 arg->flags |= F_LJUST;
13515 arg->width = -arg->width;
13516 }
13517 if (--ctx->fmtcnt >= 0) {
13518 arg->ch = FORMAT_READ(ctx);
13519 ctx->fmtpos++;
13520 }
13521 }
13522 else if (arg->ch >= '0' && arg->ch <= '9') {
13523 arg->width = arg->ch - '0';
13524 while (--ctx->fmtcnt >= 0) {
13525 arg->ch = FORMAT_READ(ctx);
13526 ctx->fmtpos++;
13527 if (arg->ch < '0' || arg->ch > '9')
13528 break;
13529 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13530 mixing signed and unsigned comparison. Since arg->ch is between
13531 '0' and '9', casting to int is safe. */
13532 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13533 PyErr_SetString(PyExc_ValueError,
13534 "width too big");
13535 return -1;
13536 }
13537 arg->width = arg->width*10 + (arg->ch - '0');
13538 }
13539 }
13540
13541 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013542 if (arg->ch == '.') {
13543 arg->prec = 0;
13544 if (--ctx->fmtcnt >= 0) {
13545 arg->ch = FORMAT_READ(ctx);
13546 ctx->fmtpos++;
13547 }
13548 if (arg->ch == '*') {
13549 v = unicode_format_getnextarg(ctx);
13550 if (v == NULL)
13551 return -1;
13552 if (!PyLong_Check(v)) {
13553 PyErr_SetString(PyExc_TypeError,
13554 "* wants int");
13555 return -1;
13556 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013557 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013558 if (arg->prec == -1 && PyErr_Occurred())
13559 return -1;
13560 if (arg->prec < 0)
13561 arg->prec = 0;
13562 if (--ctx->fmtcnt >= 0) {
13563 arg->ch = FORMAT_READ(ctx);
13564 ctx->fmtpos++;
13565 }
13566 }
13567 else if (arg->ch >= '0' && arg->ch <= '9') {
13568 arg->prec = arg->ch - '0';
13569 while (--ctx->fmtcnt >= 0) {
13570 arg->ch = FORMAT_READ(ctx);
13571 ctx->fmtpos++;
13572 if (arg->ch < '0' || arg->ch > '9')
13573 break;
13574 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13575 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013576 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013577 return -1;
13578 }
13579 arg->prec = arg->prec*10 + (arg->ch - '0');
13580 }
13581 }
13582 }
13583
13584 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13585 if (ctx->fmtcnt >= 0) {
13586 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13587 if (--ctx->fmtcnt >= 0) {
13588 arg->ch = FORMAT_READ(ctx);
13589 ctx->fmtpos++;
13590 }
13591 }
13592 }
13593 if (ctx->fmtcnt < 0) {
13594 PyErr_SetString(PyExc_ValueError,
13595 "incomplete format");
13596 return -1;
13597 }
13598 return 0;
13599
13600#undef FORMAT_READ
13601}
13602
13603/* Format one argument. Supported conversion specifiers:
13604
13605 - "s", "r", "a": any type
13606 - "i", "d", "u", "o", "x", "X": int
13607 - "e", "E", "f", "F", "g", "G": float
13608 - "c": int or str (1 character)
13609
Victor Stinner8dbd4212012-12-04 09:30:24 +010013610 When possible, the output is written directly into the Unicode writer
13611 (ctx->writer). A string is created when padding is required.
13612
Victor Stinnera47082312012-10-04 02:19:54 +020013613 Return 0 if the argument has been formatted into *p_str,
13614 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013615 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013616static int
13617unicode_format_arg_format(struct unicode_formatter_t *ctx,
13618 struct unicode_format_arg_t *arg,
13619 PyObject **p_str)
13620{
13621 PyObject *v;
13622 _PyUnicodeWriter *writer = &ctx->writer;
13623
13624 if (ctx->fmtcnt == 0)
13625 ctx->writer.overallocate = 0;
13626
13627 if (arg->ch == '%') {
13628 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13629 return -1;
13630 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13631 writer->pos += 1;
13632 return 1;
13633 }
13634
13635 v = unicode_format_getnextarg(ctx);
13636 if (v == NULL)
13637 return -1;
13638
Victor Stinnera47082312012-10-04 02:19:54 +020013639
13640 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013641 case 's':
13642 case 'r':
13643 case 'a':
13644 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13645 /* Fast path */
13646 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13647 return -1;
13648 return 1;
13649 }
13650
13651 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13652 *p_str = v;
13653 Py_INCREF(*p_str);
13654 }
13655 else {
13656 if (arg->ch == 's')
13657 *p_str = PyObject_Str(v);
13658 else if (arg->ch == 'r')
13659 *p_str = PyObject_Repr(v);
13660 else
13661 *p_str = PyObject_ASCII(v);
13662 }
13663 break;
13664
13665 case 'i':
13666 case 'd':
13667 case 'u':
13668 case 'o':
13669 case 'x':
13670 case 'X':
13671 {
13672 int ret = mainformatlong(v, arg, p_str, writer);
13673 if (ret != 0)
13674 return ret;
13675 arg->sign = 1;
13676 break;
13677 }
13678
13679 case 'e':
13680 case 'E':
13681 case 'f':
13682 case 'F':
13683 case 'g':
13684 case 'G':
13685 if (arg->width == -1 && arg->prec == -1
13686 && !(arg->flags & (F_SIGN | F_BLANK)))
13687 {
13688 /* Fast path */
13689 if (formatfloat(v, arg, NULL, writer) == -1)
13690 return -1;
13691 return 1;
13692 }
13693
13694 arg->sign = 1;
13695 if (formatfloat(v, arg, p_str, NULL) == -1)
13696 return -1;
13697 break;
13698
13699 case 'c':
13700 {
13701 Py_UCS4 ch = formatchar(v);
13702 if (ch == (Py_UCS4) -1)
13703 return -1;
13704 if (arg->width == -1 && arg->prec == -1) {
13705 /* Fast path */
13706 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13707 return -1;
13708 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13709 writer->pos += 1;
13710 return 1;
13711 }
13712 *p_str = PyUnicode_FromOrdinal(ch);
13713 break;
13714 }
13715
13716 default:
13717 PyErr_Format(PyExc_ValueError,
13718 "unsupported format character '%c' (0x%x) "
13719 "at index %zd",
13720 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13721 (int)arg->ch,
13722 ctx->fmtpos - 1);
13723 return -1;
13724 }
13725 if (*p_str == NULL)
13726 return -1;
13727 assert (PyUnicode_Check(*p_str));
13728 return 0;
13729}
13730
13731static int
13732unicode_format_arg_output(struct unicode_formatter_t *ctx,
13733 struct unicode_format_arg_t *arg,
13734 PyObject *str)
13735{
13736 Py_ssize_t len;
13737 enum PyUnicode_Kind kind;
13738 void *pbuf;
13739 Py_ssize_t pindex;
13740 Py_UCS4 signchar;
13741 Py_ssize_t buflen;
13742 Py_UCS4 maxchar, bufmaxchar;
13743 Py_ssize_t sublen;
13744 _PyUnicodeWriter *writer = &ctx->writer;
13745 Py_UCS4 fill;
13746
13747 fill = ' ';
13748 if (arg->sign && arg->flags & F_ZERO)
13749 fill = '0';
13750
13751 if (PyUnicode_READY(str) == -1)
13752 return -1;
13753
13754 len = PyUnicode_GET_LENGTH(str);
13755 if ((arg->width == -1 || arg->width <= len)
13756 && (arg->prec == -1 || arg->prec >= len)
13757 && !(arg->flags & (F_SIGN | F_BLANK)))
13758 {
13759 /* Fast path */
13760 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13761 return -1;
13762 return 0;
13763 }
13764
13765 /* Truncate the string for "s", "r" and "a" formats
13766 if the precision is set */
13767 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13768 if (arg->prec >= 0 && len > arg->prec)
13769 len = arg->prec;
13770 }
13771
13772 /* Adjust sign and width */
13773 kind = PyUnicode_KIND(str);
13774 pbuf = PyUnicode_DATA(str);
13775 pindex = 0;
13776 signchar = '\0';
13777 if (arg->sign) {
13778 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13779 if (ch == '-' || ch == '+') {
13780 signchar = ch;
13781 len--;
13782 pindex++;
13783 }
13784 else if (arg->flags & F_SIGN)
13785 signchar = '+';
13786 else if (arg->flags & F_BLANK)
13787 signchar = ' ';
13788 else
13789 arg->sign = 0;
13790 }
13791 if (arg->width < len)
13792 arg->width = len;
13793
13794 /* Prepare the writer */
13795 bufmaxchar = 127;
13796 if (!(arg->flags & F_LJUST)) {
13797 if (arg->sign) {
13798 if ((arg->width-1) > len)
13799 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13800 }
13801 else {
13802 if (arg->width > len)
13803 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13804 }
13805 }
13806 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13807 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13808 buflen = arg->width;
13809 if (arg->sign && len == arg->width)
13810 buflen++;
13811 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13812 return -1;
13813
13814 /* Write the sign if needed */
13815 if (arg->sign) {
13816 if (fill != ' ') {
13817 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13818 writer->pos += 1;
13819 }
13820 if (arg->width > len)
13821 arg->width--;
13822 }
13823
13824 /* Write the numeric prefix for "x", "X" and "o" formats
13825 if the alternate form is used.
13826 For example, write "0x" for the "%#x" format. */
13827 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13828 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13829 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13830 if (fill != ' ') {
13831 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13833 writer->pos += 2;
13834 pindex += 2;
13835 }
13836 arg->width -= 2;
13837 if (arg->width < 0)
13838 arg->width = 0;
13839 len -= 2;
13840 }
13841
13842 /* Pad left with the fill character if needed */
13843 if (arg->width > len && !(arg->flags & F_LJUST)) {
13844 sublen = arg->width - len;
13845 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13846 writer->pos += sublen;
13847 arg->width = len;
13848 }
13849
13850 /* If padding with spaces: write sign if needed and/or numeric prefix if
13851 the alternate form is used */
13852 if (fill == ' ') {
13853 if (arg->sign) {
13854 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13855 writer->pos += 1;
13856 }
13857 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13858 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13859 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13860 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13862 writer->pos += 2;
13863 pindex += 2;
13864 }
13865 }
13866
13867 /* Write characters */
13868 if (len) {
13869 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13870 str, pindex, len);
13871 writer->pos += len;
13872 }
13873
13874 /* Pad right with the fill character if needed */
13875 if (arg->width > len) {
13876 sublen = arg->width - len;
13877 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13878 writer->pos += sublen;
13879 }
13880 return 0;
13881}
13882
13883/* Helper of PyUnicode_Format(): format one arg.
13884 Return 0 on success, raise an exception and return -1 on error. */
13885static int
13886unicode_format_arg(struct unicode_formatter_t *ctx)
13887{
13888 struct unicode_format_arg_t arg;
13889 PyObject *str;
13890 int ret;
13891
Victor Stinner8dbd4212012-12-04 09:30:24 +010013892 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
13893 arg.flags = 0;
13894 arg.width = -1;
13895 arg.prec = -1;
13896 arg.sign = 0;
13897 str = NULL;
13898
Victor Stinnera47082312012-10-04 02:19:54 +020013899 ret = unicode_format_arg_parse(ctx, &arg);
13900 if (ret == -1)
13901 return -1;
13902
13903 ret = unicode_format_arg_format(ctx, &arg, &str);
13904 if (ret == -1)
13905 return -1;
13906
13907 if (ret != 1) {
13908 ret = unicode_format_arg_output(ctx, &arg, str);
13909 Py_DECREF(str);
13910 if (ret == -1)
13911 return -1;
13912 }
13913
13914 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13915 PyErr_SetString(PyExc_TypeError,
13916 "not all arguments converted during string formatting");
13917 return -1;
13918 }
13919 return 0;
13920}
13921
Alexander Belopolsky40018472011-02-26 01:02:56 +000013922PyObject *
13923PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924{
Victor Stinnera47082312012-10-04 02:19:54 +020013925 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013926
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 PyErr_BadInternalCall();
13929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930 }
Victor Stinnera47082312012-10-04 02:19:54 +020013931
13932 ctx.fmtstr = PyUnicode_FromObject(format);
13933 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013935 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13936 Py_DECREF(ctx.fmtstr);
13937 return NULL;
13938 }
13939 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13940 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13941 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13942 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013943
Victor Stinnera47082312012-10-04 02:19:54 +020013944 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013945
Guido van Rossumd57fd912000-03-10 22:53:23 +000013946 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013947 ctx.arglen = PyTuple_Size(args);
13948 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949 }
13950 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013951 ctx.arglen = -1;
13952 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953 }
Victor Stinnera47082312012-10-04 02:19:54 +020013954 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013955 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013956 ctx.dict = args;
13957 else
13958 ctx.dict = NULL;
13959 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013960
Victor Stinnera47082312012-10-04 02:19:54 +020013961 while (--ctx.fmtcnt >= 0) {
13962 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13963 Py_ssize_t nonfmtpos, sublen;
13964 Py_UCS4 maxchar;
13965
13966 nonfmtpos = ctx.fmtpos++;
13967 while (ctx.fmtcnt >= 0 &&
13968 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13969 ctx.fmtpos++;
13970 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 }
Victor Stinnera47082312012-10-04 02:19:54 +020013972 if (ctx.fmtcnt < 0) {
13973 ctx.fmtpos--;
13974 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013975 }
Victor Stinnera47082312012-10-04 02:19:54 +020013976 sublen = ctx.fmtpos - nonfmtpos;
13977 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013978 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013979 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013980 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013981
Victor Stinnera47082312012-10-04 02:19:54 +020013982 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13983 ctx.fmtstr, nonfmtpos, sublen);
13984 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 }
13986 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013987 ctx.fmtpos++;
13988 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013990 }
13991 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013992
Victor Stinnera47082312012-10-04 02:19:54 +020013993 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013994 PyErr_SetString(PyExc_TypeError,
13995 "not all arguments converted during string formatting");
13996 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997 }
13998
Victor Stinnera47082312012-10-04 02:19:54 +020013999 if (ctx.args_owned) {
14000 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001 }
Victor Stinnera47082312012-10-04 02:19:54 +020014002 Py_DECREF(ctx.fmtstr);
14003 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014004
Benjamin Peterson29060642009-01-31 22:14:21 +000014005 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014006 Py_DECREF(ctx.fmtstr);
14007 _PyUnicodeWriter_Dealloc(&ctx.writer);
14008 if (ctx.args_owned) {
14009 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014010 }
14011 return NULL;
14012}
14013
Jeremy Hylton938ace62002-07-17 16:30:39 +000014014static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014015unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14016
Tim Peters6d6c1a32001-08-02 04:15:00 +000014017static PyObject *
14018unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14019{
Benjamin Peterson29060642009-01-31 22:14:21 +000014020 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 static char *kwlist[] = {"object", "encoding", "errors", 0};
14022 char *encoding = NULL;
14023 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014024
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 if (type != &PyUnicode_Type)
14026 return unicode_subtype_new(type, args, kwds);
14027 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 return NULL;
14030 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014031 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 if (encoding == NULL && errors == NULL)
14033 return PyObject_Str(x);
14034 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014035 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014036}
14037
Guido van Rossume023fe02001-08-30 03:12:59 +000014038static PyObject *
14039unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14040{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014041 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014042 Py_ssize_t length, char_size;
14043 int share_wstr, share_utf8;
14044 unsigned int kind;
14045 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014046
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014049 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014052 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014053 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014054 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014056 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014057
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014058 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014059 if (self == NULL) {
14060 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 return NULL;
14062 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063 kind = PyUnicode_KIND(unicode);
14064 length = PyUnicode_GET_LENGTH(unicode);
14065
14066 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014067#ifdef Py_DEBUG
14068 _PyUnicode_HASH(self) = -1;
14069#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014071#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014072 _PyUnicode_STATE(self).interned = 0;
14073 _PyUnicode_STATE(self).kind = kind;
14074 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014075 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014076 _PyUnicode_STATE(self).ready = 1;
14077 _PyUnicode_WSTR(self) = NULL;
14078 _PyUnicode_UTF8_LENGTH(self) = 0;
14079 _PyUnicode_UTF8(self) = NULL;
14080 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014081 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014082
14083 share_utf8 = 0;
14084 share_wstr = 0;
14085 if (kind == PyUnicode_1BYTE_KIND) {
14086 char_size = 1;
14087 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14088 share_utf8 = 1;
14089 }
14090 else if (kind == PyUnicode_2BYTE_KIND) {
14091 char_size = 2;
14092 if (sizeof(wchar_t) == 2)
14093 share_wstr = 1;
14094 }
14095 else {
14096 assert(kind == PyUnicode_4BYTE_KIND);
14097 char_size = 4;
14098 if (sizeof(wchar_t) == 4)
14099 share_wstr = 1;
14100 }
14101
14102 /* Ensure we won't overflow the length. */
14103 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14104 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014105 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014107 data = PyObject_MALLOC((length + 1) * char_size);
14108 if (data == NULL) {
14109 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014110 goto onError;
14111 }
14112
Victor Stinnerc3c74152011-10-02 20:39:55 +020014113 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014114 if (share_utf8) {
14115 _PyUnicode_UTF8_LENGTH(self) = length;
14116 _PyUnicode_UTF8(self) = data;
14117 }
14118 if (share_wstr) {
14119 _PyUnicode_WSTR_LENGTH(self) = length;
14120 _PyUnicode_WSTR(self) = (wchar_t *)data;
14121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014123 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014124 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014125 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014126#ifdef Py_DEBUG
14127 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14128#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014129 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014130 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014131
14132onError:
14133 Py_DECREF(unicode);
14134 Py_DECREF(self);
14135 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014136}
14137
/* Docstring of the str type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014149
/* Defined later in this file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots set to 0 are left unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,     /* tp_as_number */
    &unicode_as_sequence,   /* tp_as_sequence */
    &unicode_as_mapping,    /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,        /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,     /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14195
14196/* Initialize the Unicode implementation */
14197
Victor Stinner3a50e702011-10-18 21:21:00 +020014198int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014200 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014202 0x000A, /* LINE FEED */
14203 0x000D, /* CARRIAGE RETURN */
14204 0x001C, /* FILE SEPARATOR */
14205 0x001D, /* GROUP SEPARATOR */
14206 0x001E, /* RECORD SEPARATOR */
14207 0x0085, /* NEXT LINE */
14208 0x2028, /* LINE SEPARATOR */
14209 0x2029, /* PARAGRAPH SEPARATOR */
14210 };
14211
Fred Drakee4315f52000-05-09 19:53:39 +000014212 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014213 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014214 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014216 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014217
Guido van Rossumcacfc072002-05-24 19:01:59 +000014218 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014220
14221 /* initialize the linebreak bloom filter */
14222 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014224 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014225
14226 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014227
Benjamin Petersonc4311282012-10-30 23:21:10 -040014228 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14229 Py_FatalError("Can't initialize field name iterator type");
14230
14231 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14232 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014233
Victor Stinner3a50e702011-10-18 21:21:00 +020014234#ifdef HAVE_MBCS
14235 winver.dwOSVersionInfoSize = sizeof(winver);
14236 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14237 PyErr_SetFromWindowsErr(0);
14238 return -1;
14239 }
14240#endif
14241 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242}
14243
14244/* Finalize the Unicode implementation */
14245
/* Nothing is released here: the function unconditionally returns 0.
   NOTE(review): presumably kept so that existing callers keep working;
   confirm before removing. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14251
Guido van Rossumd57fd912000-03-10 22:53:23 +000014252void
Thomas Wouters78890102000-07-22 19:25:51 +000014253_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014255 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014256
Serhiy Storchaka05997252013-01-26 12:14:02 +020014257 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014258
Serhiy Storchaka05997252013-01-26 12:14:02 +020014259 for (i = 0; i < 256; i++)
14260 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014261 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014262 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014264
Walter Dörwald16807132007-05-25 13:52:07 +000014265void
14266PyUnicode_InternInPlace(PyObject **p)
14267{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014268 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014270#ifdef Py_DEBUG
14271 assert(s != NULL);
14272 assert(_PyUnicode_CHECK(s));
14273#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014275 return;
14276#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 /* If it's a subclass, we don't really know what putting
14278 it in the interned dict might do. */
14279 if (!PyUnicode_CheckExact(s))
14280 return;
14281 if (PyUnicode_CHECK_INTERNED(s))
14282 return;
14283 if (interned == NULL) {
14284 interned = PyDict_New();
14285 if (interned == NULL) {
14286 PyErr_Clear(); /* Don't leave an exception */
14287 return;
14288 }
14289 }
14290 /* It might be that the GetItem call fails even
14291 though the key is present in the dictionary,
14292 namely when this happens during a stack overflow. */
14293 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014294 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014296
Benjamin Peterson29060642009-01-31 22:14:21 +000014297 if (t) {
14298 Py_INCREF(t);
14299 Py_DECREF(*p);
14300 *p = t;
14301 return;
14302 }
Walter Dörwald16807132007-05-25 13:52:07 +000014303
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014305 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 PyErr_Clear();
14307 PyThreadState_GET()->recursion_critical = 0;
14308 return;
14309 }
14310 PyThreadState_GET()->recursion_critical = 0;
14311 /* The two references in interned are not counted by refcnt.
14312 The deallocator will take care of this */
14313 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014315}
14316
14317void
14318PyUnicode_InternImmortal(PyObject **p)
14319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 PyUnicode_InternInPlace(p);
14321 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014322 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014323 Py_INCREF(*p);
14324 }
Walter Dörwald16807132007-05-25 13:52:07 +000014325}
14326
14327PyObject *
14328PyUnicode_InternFromString(const char *cp)
14329{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014330 PyObject *s = PyUnicode_FromString(cp);
14331 if (s == NULL)
14332 return NULL;
14333 PyUnicode_InternInPlace(&s);
14334 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014335}
14336
Alexander Belopolsky40018472011-02-26 01:02:56 +000014337void
14338_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014339{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014340 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014341 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 Py_ssize_t i, n;
14343 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014344
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 if (interned == NULL || !PyDict_Check(interned))
14346 return;
14347 keys = PyDict_Keys(interned);
14348 if (keys == NULL || !PyList_Check(keys)) {
14349 PyErr_Clear();
14350 return;
14351 }
Walter Dörwald16807132007-05-25 13:52:07 +000014352
Benjamin Peterson14339b62009-01-31 16:36:08 +000014353 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14354 detector, interned unicode strings are not forcibly deallocated;
14355 rather, we give them their stolen references back, and then clear
14356 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014357
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 n = PyList_GET_SIZE(keys);
14359 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014360 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014362 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014363 if (PyUnicode_READY(s) == -1) {
14364 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014365 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014367 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 case SSTATE_NOT_INTERNED:
14369 /* XXX Shouldn't happen */
14370 break;
14371 case SSTATE_INTERNED_IMMORTAL:
14372 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 break;
14375 case SSTATE_INTERNED_MORTAL:
14376 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014377 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 break;
14379 default:
14380 Py_FatalError("Inconsistent interned string state.");
14381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014382 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 }
14384 fprintf(stderr, "total size of all interned strings: "
14385 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14386 "mortal/immortal\n", mortal_size, immortal_size);
14387 Py_DECREF(keys);
14388 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014389 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014390}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014391
14392
14393/********************* Unicode Iterator **************************/
14394
/* State of an iterator over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14400
14401static void
14402unicodeiter_dealloc(unicodeiterobject *it)
14403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014404 _PyObject_GC_UNTRACK(it);
14405 Py_XDECREF(it->it_seq);
14406 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014407}
14408
/* GC traversal of the str iterator: the only object it refers to is the
   string being iterated.  Py_VISIT relies on the parameter names `visit`
   and `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14415
14416static PyObject *
14417unicodeiter_next(unicodeiterobject *it)
14418{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014419 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014420
Benjamin Peterson14339b62009-01-31 16:36:08 +000014421 assert(it != NULL);
14422 seq = it->it_seq;
14423 if (seq == NULL)
14424 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014425 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014427 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14428 int kind = PyUnicode_KIND(seq);
14429 void *data = PyUnicode_DATA(seq);
14430 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14431 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014432 if (item != NULL)
14433 ++it->it_index;
14434 return item;
14435 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014436
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 Py_DECREF(seq);
14438 it->it_seq = NULL;
14439 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014440}
14441
14442static PyObject *
14443unicodeiter_len(unicodeiterobject *it)
14444{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 Py_ssize_t len = 0;
14446 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014447 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014448 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449}
14450
14451PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14452
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014453static PyObject *
14454unicodeiter_reduce(unicodeiterobject *it)
14455{
14456 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014457 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014458 it->it_seq, it->it_index);
14459 } else {
14460 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14461 if (u == NULL)
14462 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014463 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014464 }
14465}
14466
14467PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14468
14469static PyObject *
14470unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14471{
14472 Py_ssize_t index = PyLong_AsSsize_t(state);
14473 if (index == -1 && PyErr_Occurred())
14474 return NULL;
14475 if (index < 0)
14476 index = 0;
14477 it->it_index = index;
14478 Py_RETURN_NONE;
14479}
14480
14481PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014483static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014485 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014486 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14487 reduce_doc},
14488 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14489 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014490 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014491};
14492
14493PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14495 "str_iterator", /* tp_name */
14496 sizeof(unicodeiterobject), /* tp_basicsize */
14497 0, /* tp_itemsize */
14498 /* methods */
14499 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14500 0, /* tp_print */
14501 0, /* tp_getattr */
14502 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014503 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 0, /* tp_repr */
14505 0, /* tp_as_number */
14506 0, /* tp_as_sequence */
14507 0, /* tp_as_mapping */
14508 0, /* tp_hash */
14509 0, /* tp_call */
14510 0, /* tp_str */
14511 PyObject_GenericGetAttr, /* tp_getattro */
14512 0, /* tp_setattro */
14513 0, /* tp_as_buffer */
14514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14515 0, /* tp_doc */
14516 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14517 0, /* tp_clear */
14518 0, /* tp_richcompare */
14519 0, /* tp_weaklistoffset */
14520 PyObject_SelfIter, /* tp_iter */
14521 (iternextfunc)unicodeiter_next, /* tp_iternext */
14522 unicodeiter_methods, /* tp_methods */
14523 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014524};
14525
14526static PyObject *
14527unicode_iter(PyObject *seq)
14528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014530
Benjamin Peterson14339b62009-01-31 16:36:08 +000014531 if (!PyUnicode_Check(seq)) {
14532 PyErr_BadInternalCall();
14533 return NULL;
14534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014535 if (PyUnicode_READY(seq) == -1)
14536 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14538 if (it == NULL)
14539 return NULL;
14540 it->it_index = 0;
14541 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014542 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 _PyObject_GC_TRACK(it);
14544 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014545}
14546
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014547
14548size_t
14549Py_UNICODE_strlen(const Py_UNICODE *u)
14550{
14551 int res = 0;
14552 while(*u++)
14553 res++;
14554 return res;
14555}
14556
14557Py_UNICODE*
14558Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14559{
14560 Py_UNICODE *u = s1;
14561 while ((*u++ = *s2++));
14562 return s1;
14563}
14564
14565Py_UNICODE*
14566Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14567{
14568 Py_UNICODE *u = s1;
14569 while ((*u++ = *s2++))
14570 if (n-- == 0)
14571 break;
14572 return s1;
14573}
14574
14575Py_UNICODE*
14576Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14577{
14578 Py_UNICODE *u1 = s1;
14579 u1 += Py_UNICODE_strlen(u1);
14580 Py_UNICODE_strcpy(u1, s2);
14581 return s1;
14582}
14583
14584int
14585Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14586{
14587 while (*s1 && *s2 && *s1 == *s2)
14588 s1++, s2++;
14589 if (*s1 && *s2)
14590 return (*s1 < *s2) ? -1 : +1;
14591 if (*s1)
14592 return 1;
14593 if (*s2)
14594 return -1;
14595 return 0;
14596}
14597
14598int
14599Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14600{
14601 register Py_UNICODE u1, u2;
14602 for (; n != 0; n--) {
14603 u1 = *s1;
14604 u2 = *s2;
14605 if (u1 != u2)
14606 return (u1 < u2) ? -1 : +1;
14607 if (u1 == '\0')
14608 return 0;
14609 s1++;
14610 s2++;
14611 }
14612 return 0;
14613}
14614
14615Py_UNICODE*
14616Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14617{
14618 const Py_UNICODE *p;
14619 for (p = s; *p; p++)
14620 if (*p == c)
14621 return (Py_UNICODE*)p;
14622 return NULL;
14623}
14624
14625Py_UNICODE*
14626Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14627{
14628 const Py_UNICODE *p;
14629 p = s + Py_UNICODE_strlen(s);
14630 while (p != s) {
14631 p--;
14632 if (*p == c)
14633 return (Py_UNICODE*)p;
14634 }
14635 return NULL;
14636}
Victor Stinner331ea922010-08-10 16:37:20 +000014637
Victor Stinner71133ff2010-09-01 23:43:53 +000014638Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014639PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014640{
Victor Stinner577db2c2011-10-11 22:12:48 +020014641 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014642 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014644 if (!PyUnicode_Check(unicode)) {
14645 PyErr_BadArgument();
14646 return NULL;
14647 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014648 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014649 if (u == NULL)
14650 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014651 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014652 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014653 PyErr_NoMemory();
14654 return NULL;
14655 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014656 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014657 size *= sizeof(Py_UNICODE);
14658 copy = PyMem_Malloc(size);
14659 if (copy == NULL) {
14660 PyErr_NoMemory();
14661 return NULL;
14662 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014663 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014664 return copy;
14665}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014666
Georg Brandl66c221e2010-10-14 07:04:07 +000014667/* A _string module, to export formatter_parser and formatter_field_name_split
14668 to the string.Formatter class implemented in Python. */
14669
14670static PyMethodDef _string_methods[] = {
14671 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14672 METH_O, PyDoc_STR("split the argument as a field name")},
14673 {"formatter_parser", (PyCFunction) formatter_parser,
14674 METH_O, PyDoc_STR("parse the argument as a format string")},
14675 {NULL, NULL}
14676};
14677
14678static struct PyModuleDef _string_module = {
14679 PyModuleDef_HEAD_INIT,
14680 "_string",
14681 PyDoc_STR("string helper module"),
14682 0,
14683 _string_methods,
14684 NULL,
14685 NULL,
14686 NULL,
14687 NULL
14688};
14689
14690PyMODINIT_FUNC
14691PyInit__string(void)
14692{
14693 return PyModule_Create(&_string_module);
14694}
14695
14696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014697#ifdef __cplusplus
14698}
14699#endif