blob: 21cafb32515118234f0333f49ca9b05fa829d501 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list's "keep alive" optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Sanity check used inside the accessor macros below: debug builds run
   the structural checker _PyUnicode_CheckConsistency() (without content
   checking), other builds only do the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Naming convention in this group: the macros that expand to a bare
   struct member (_PyUnicode_UTF8, _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR,
   _PyUnicode_WSTR_LENGTH, _PyUnicode_LENGTH, _PyUnicode_STATE,
   _PyUnicode_HASH, _PyUnicode_DATA_ANY) perform no checks and can be used
   as assignment targets; the remaining ones assert on op first and are
   for reading only. */

/* Raw utf8 pointer field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 representation of a ready string: for compact ASCII objects this
   is the memory directly following the PyASCIIObject header, otherwise
   the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 representation of a ready string: for compact ASCII
   objects this is the string length itself, otherwise utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* Checked readers for state.kind and length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Override PyUnicode_READY() for this file with a variant that asserts
   _PyUnicode_CHECK(op) first.  Evaluates to 0 when op is already ready,
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same idea for a pointer to an object reference: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).
   The parameter is parenthesized before being dereferenced so that an
   argument such as "p + 1" inspects *(p + 1) rather than (*p) + 1. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of op aliases its canonical data buffer.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases its canonical data buffer.
   Both sides of the comparison use the macro parameter; the macro no
   longer depends on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is set, and that pointer differs from the data buffer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is set and either the object is not
   ready yet or wstr differs from the data buffer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Element-wise copy with a cast between character widths.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer and is treated as "to_type *".  Each source
   element is cast to to_type and stored at the matching destination
   position.  begin and to are evaluated once, end once per iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* The Unicode string has been modified: reset the cached hash to -1
   (the same value _PyUnicode_New() stores for a fresh object) */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well (one slot per code point 0..255). */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point: 1 for whitespace,
   0 for everything else.  Laid out 16 entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
238
/* forward declarations of file-local helpers that are used before the
   point where they are defined */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static void copy_characters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start,
    Py_ssize_t how_many);
/* only needed by the debug-build consistency checker */
#ifdef Py_DEBUG
static int unicode_is_singleton(PyObject *unicode);
#endif

/* NOTE(review): the two encoder helpers below are defined outside the
   part of the file visible here; their contracts are documented at their
   definitions. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same for linebreaks: 128-entry table indexed by ASCII code point,
   1 for line break characters, 0 otherwise.  16 entries per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build structural checker for unicode objects.

   Asserts that the state flags (kind, compact, ascii, ready, interned)
   and the wstr/utf8/data pointers and lengths of op agree with each
   other.  If check_content is true, additionally scans the characters to
   verify that the narrowest possible kind is used and (except for
   singletons) that no hash has been cached yet.

   Always returns 1 so that it can be wrapped in assert(); any violation
   aborts through the assert() calls inside. */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject header exists */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may exist */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* non-compact, ready: separately stored data buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero recorded length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest code point actually stored */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    /* content checks expect an uncached hash, except on singletons */
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
410
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411/* --- Bloom Filters ----------------------------------------------------- */
412
413/* stuff to implement simple "bloom filters" for Unicode characters.
414 to keep things simple, we use a single bitmask, using the least 5
415 bits from each unicode characters as the bit index. */
416
417/* the linebreak mask is set up by Unicode_Init below */
418
Antoine Pitrouf068f942010-01-13 14:19:12 +0000419#if LONG_BIT >= 128
420#define BLOOM_WIDTH 128
421#elif LONG_BIT >= 64
422#define BLOOM_WIDTH 64
423#elif LONG_BIT >= 32
424#define BLOOM_WIDTH 32
425#else
426#error "LONG_BIT is smaller than 32"
427#endif
428
Thomas Wouters477c8d52006-05-27 19:21:47 +0000429#define BLOOM_MASK unsigned long
430
431static BLOOM_MASK bloom_linebreak;
432
Antoine Pitrouf068f942010-01-13 14:19:12 +0000433#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
434#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000435
Benjamin Peterson29060642009-01-31 22:14:21 +0000436#define BLOOM_LINEBREAK(ch) \
437 ((ch) < 128U ? ascii_linebreak[(ch)] : \
438 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439
Alexander Belopolsky40018472011-02-26 01:02:56 +0000440Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442{
443 /* calculate simple bloom-style bitmask for a given unicode string */
444
Antoine Pitrouf068f942010-01-13 14:19:12 +0000445 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446 Py_ssize_t i;
447
448 mask = 0;
449 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000451
452 return mask;
453}
454
/* Membership test for chr in the unicode object str, given a bloom mask
   for str: the mask is consulted first and the full PyUnicode_FindChar()
   search over the whole string only runs when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459/* --- Unicode Object ----------------------------------------------------- */
460
/* Forward declaration; fixup() takes a string and a per-string callback
   returning a Py_UCS4.  NOTE(review): the definition is outside the part
   of the file visible here; see it for the exact contract. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
465 Py_ssize_t size, Py_UCS4 ch,
466 int direction)
467{
468 /* like wcschr, but doesn't stop at NULL characters */
469 Py_ssize_t i;
470 if (direction == 1) {
471 for(i = 0; i < size; i++)
472 if (PyUnicode_READ(kind, s, i) == ch)
473 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
474 }
475 else {
476 for(i = size-1; i >= 0; i--)
477 if (PyUnicode_READ(kind, s, i) == ch)
478 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
479 }
480 return NULL;
481}
482
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483static PyObject*
484resize_compact(PyObject *unicode, Py_ssize_t length)
485{
486 Py_ssize_t char_size;
487 Py_ssize_t struct_size;
488 Py_ssize_t new_size;
489 int share_wstr;
490
491 assert(PyUnicode_IS_READY(unicode));
492 char_size = PyUnicode_CHARACTER_SIZE(unicode);
493 if (PyUnicode_IS_COMPACT_ASCII(unicode))
494 struct_size = sizeof(PyASCIIObject);
495 else
496 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200497 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498
499 _Py_DEC_REFTOTAL;
500 _Py_ForgetReference(unicode);
501
502 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
503 PyErr_NoMemory();
504 return NULL;
505 }
506 new_size = (struct_size + (length + 1) * char_size);
507
508 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
509 if (unicode == NULL) {
510 PyObject_Del(unicode);
511 PyErr_NoMemory();
512 return NULL;
513 }
514 _Py_NewReference(unicode);
515 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200517 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
522 length, 0);
523 return unicode;
524}
525
/* Resize a non-compact unicode object in place to `length` characters.

   The object must not be compact and must have a reference count of 1
   (both asserted).  The cached hash is reset.  For a ready object the
   canonical data buffer is reallocated, a separately allocated UTF-8
   buffer is discarded, and wstr/utf8 pointers that aliased the data
   buffer are re-pointed at the new block.  If a separately allocated
   wstr buffer exists (always the case for a not-ready object), it is
   reallocated as well.

   Returns 0 on success, -1 with MemoryError set on failure. */
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* record aliasing before the data block can move */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        /* a private UTF-8 buffer is dropped rather than resized */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* guard (length + 1) * char_size against overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* the result goes into the local only; the object's data pointer
           is updated after the NULL check, so a failure leaves the old
           buffer attached */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* resize the separately allocated wstr buffer; as above, the field is
       only overwritten once the reallocation has succeeded */
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
599
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600static PyObject*
601resize_copy(PyObject *unicode, Py_ssize_t length)
602{
603 Py_ssize_t copy_length;
604 if (PyUnicode_IS_COMPACT(unicode)) {
605 PyObject *copy;
606 assert(PyUnicode_IS_READY(unicode));
607
608 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
609 if (copy == NULL)
610 return NULL;
611
612 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200613 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200615 }
616 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 assert(_PyUnicode_WSTR(unicode) != NULL);
619 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 if (w == NULL)
622 return NULL;
623 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
624 copy_length = Py_MIN(copy_length, length);
625 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
626 copy_length);
627 return (PyObject*)w;
628 }
629}
630
/* We allocate one more character than requested to make sure the string
   is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

/* debug builds count every call that reaches the allocation step of
   _PyUnicode_New() */
#ifdef Py_DEBUG
int unicode_old_new_calls = 0;
#endif
643
Alexander Belopolsky40018472011-02-26 01:02:56 +0000644static PyUnicodeObject *
645_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646{
647 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 if (length == 0 && unicode_empty != NULL) {
652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200653 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000656 /* Ensure we won't overflow the size. */
657 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
658 return (PyUnicodeObject *)PyErr_NoMemory();
659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660 if (length < 0) {
661 PyErr_SetString(PyExc_SystemError,
662 "Negative size passed to _PyUnicode_New");
663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666#ifdef Py_DEBUG
667 ++unicode_old_new_calls;
668#endif
669
670 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
671 if (unicode == NULL)
672 return NULL;
673 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
674 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
675 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000676 PyErr_NoMemory();
677 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679
Jeremy Hyltond8082792003-09-16 19:41:39 +0000680 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000681 * the caller fails before initializing str -- unicode_resize()
682 * reads str[0], and the Keep-Alive optimization can keep memory
683 * allocated for str alive across a call to unicode_dealloc(unicode).
684 * We don't want unicode_resize to read uninitialized memory in
685 * that case.
686 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 _PyUnicode_WSTR(unicode)[0] = 0;
688 _PyUnicode_WSTR(unicode)[length] = 0;
689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 _PyUnicode_HASH(unicode) = -1;
691 _PyUnicode_STATE(unicode).interned = 0;
692 _PyUnicode_STATE(unicode).kind = 0;
693 _PyUnicode_STATE(unicode).compact = 0;
694 _PyUnicode_STATE(unicode).ready = 0;
695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200696 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200698 _PyUnicode_UTF8(unicode) = NULL;
699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000701
Benjamin Peterson29060642009-01-31 22:14:21 +0000702 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000703 /* XXX UNREF/NEWREF interface should be more symmetrical */
704 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000706 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708}
709
Victor Stinnerf42dc442011-10-02 23:33:16 +0200710static const char*
711unicode_kind_name(PyObject *unicode)
712{
Victor Stinner42dfd712011-10-03 14:41:45 +0200713 /* don't check consistency: unicode_kind_name() is called from
714 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (!PyUnicode_IS_COMPACT(unicode))
716 {
717 if (!PyUnicode_IS_READY(unicode))
718 return "wstr";
719 switch(PyUnicode_KIND(unicode))
720 {
721 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 return "legacy ascii";
724 else
725 return "legacy latin1";
726 case PyUnicode_2BYTE_KIND:
727 return "legacy UCS2";
728 case PyUnicode_4BYTE_KIND:
729 return "legacy UCS4";
730 default:
731 return "<legacy invalid kind>";
732 }
733 }
734 assert(PyUnicode_IS_READY(unicode));
735 switch(PyUnicode_KIND(unicode))
736 {
737 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 return "ascii";
740 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200745 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200746 default:
747 return "<invalid compact kind>";
748 }
749}
750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() each time it goes on
   to build a new object. */
int unicode_new_new_calls = 0;
753
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
758
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
762void *_PyUnicode_data(void *unicode){
763 printf("obj %p\n", unicode);
764 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
765 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
766 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
767 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
768 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
769 return PyUnicode_DATA(unicode);
770}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200771
772void
773_PyUnicode_Dump(PyObject *op)
774{
775 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200776 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
777 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
778 void *data;
779 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
780 if (ascii->state.compact)
781 data = (compact + 1);
782 else
783 data = unicode->data.any;
784 if (ascii->wstr == data)
785 printf("shared ");
786 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200787 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200788 printf(" (%zu), ", compact->wstr_length);
789 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
790 printf("shared ");
791 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200792 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200793 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200794}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
797PyObject *
798PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
799{
800 PyObject *obj;
801 PyCompactUnicodeObject *unicode;
802 void *data;
803 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200804 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 Py_ssize_t char_size;
806 Py_ssize_t struct_size;
807
808 /* Optimization for empty strings */
809 if (size == 0 && unicode_empty != NULL) {
810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200811 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812 }
813
814#ifdef Py_DEBUG
815 ++unicode_new_new_calls;
816#endif
817
Victor Stinner9e9d6892011-10-04 01:02:02 +0200818 is_ascii = 0;
819 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 struct_size = sizeof(PyCompactUnicodeObject);
821 if (maxchar < 128) {
822 kind_state = PyUnicode_1BYTE_KIND;
823 char_size = 1;
824 is_ascii = 1;
825 struct_size = sizeof(PyASCIIObject);
826 }
827 else if (maxchar < 256) {
828 kind_state = PyUnicode_1BYTE_KIND;
829 char_size = 1;
830 }
831 else if (maxchar < 65536) {
832 kind_state = PyUnicode_2BYTE_KIND;
833 char_size = 2;
834 if (sizeof(wchar_t) == 2)
835 is_sharing = 1;
836 }
837 else {
838 kind_state = PyUnicode_4BYTE_KIND;
839 char_size = 4;
840 if (sizeof(wchar_t) == 4)
841 is_sharing = 1;
842 }
843
844 /* Ensure we won't overflow the size. */
845 if (size < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to PyUnicode_New");
848 return NULL;
849 }
850 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
851 return PyErr_NoMemory();
852
853 /* Duplicated allocation code from _PyObject_New() instead of a call to
854 * PyObject_New() so we are able to allocate space for the object and
855 * it's data buffer.
856 */
857 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
858 if (obj == NULL)
859 return PyErr_NoMemory();
860 obj = PyObject_INIT(obj, &PyUnicode_Type);
861 if (obj == NULL)
862 return NULL;
863
864 unicode = (PyCompactUnicodeObject *)obj;
865 if (is_ascii)
866 data = ((PyASCIIObject*)obj) + 1;
867 else
868 data = unicode + 1;
869 _PyUnicode_LENGTH(unicode) = size;
870 _PyUnicode_HASH(unicode) = -1;
871 _PyUnicode_STATE(unicode).interned = 0;
872 _PyUnicode_STATE(unicode).kind = kind_state;
873 _PyUnicode_STATE(unicode).compact = 1;
874 _PyUnicode_STATE(unicode).ready = 1;
875 _PyUnicode_STATE(unicode).ascii = is_ascii;
876 if (is_ascii) {
877 ((char*)data)[size] = 0;
878 _PyUnicode_WSTR(unicode) = NULL;
879 }
880 else if (kind_state == PyUnicode_1BYTE_KIND) {
881 ((char*)data)[size] = 0;
882 _PyUnicode_WSTR(unicode) = NULL;
883 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200885 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 }
887 else {
888 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 if (kind_state == PyUnicode_2BYTE_KIND)
891 ((Py_UCS2*)data)[size] = 0;
892 else /* kind_state == PyUnicode_4BYTE_KIND */
893 ((Py_UCS4*)data)[size] = 0;
894 if (is_sharing) {
895 _PyUnicode_WSTR_LENGTH(unicode) = size;
896 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
897 }
898 else {
899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
900 _PyUnicode_WSTR(unicode) = NULL;
901 }
902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200903 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 return obj;
905}
906
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is decoded into a
   single code point; every other unit (lone surrogates included) is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && src[0] >= 0xD800 && src[0] <= 0xDBFF
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Surrogate pair: combine the two 10-bit payloads. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
945
Victor Stinnercd9950f2011-10-02 00:34:53 +0200946static int
947_PyUnicode_Dirty(PyObject *unicode)
948{
Victor Stinner910337b2011-10-03 03:20:16 +0200949 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200951 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200952 "Cannot modify a string having more than 1 reference");
953 return -1;
954 }
955 _PyUnicode_DIRTY(unicode);
956 return 0;
957}
958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200959static int
960_copy_characters(PyObject *to, Py_ssize_t to_start,
961 PyObject *from, Py_ssize_t from_start,
962 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 unsigned int from_kind, to_kind;
965 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200968 assert(PyUnicode_Check(from));
969 assert(PyUnicode_Check(to));
970 assert(PyUnicode_IS_READY(from));
971 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200973 assert(PyUnicode_GET_LENGTH(from) >= how_many);
974 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
975 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200977 if (how_many == 0)
978 return 0;
979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200983 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200985#ifdef Py_DEBUG
986 if (!check_maxchar
987 && (from_kind > to_kind
988 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200990 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
991 Py_UCS4 ch;
992 Py_ssize_t i;
993 for (i=0; i < how_many; i++) {
994 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
995 assert(ch <= to_maxchar);
996 }
997 }
998#endif
999 fast = (from_kind == to_kind);
1000 if (check_maxchar
1001 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1002 {
1003 /* deny latin1 => ascii */
1004 fast = 0;
1005 }
1006
1007 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001011 + PyUnicode_KIND_SIZE(from_kind, from_start),
1012 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else if (from_kind == PyUnicode_1BYTE_KIND
1015 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001016 {
1017 _PyUnicode_CONVERT_BYTES(
1018 Py_UCS1, Py_UCS2,
1019 PyUnicode_1BYTE_DATA(from) + from_start,
1020 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1021 PyUnicode_2BYTE_DATA(to) + to_start
1022 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001024 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001025 && to_kind == PyUnicode_4BYTE_KIND)
1026 {
1027 _PyUnicode_CONVERT_BYTES(
1028 Py_UCS1, Py_UCS4,
1029 PyUnicode_1BYTE_DATA(from) + from_start,
1030 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1031 PyUnicode_4BYTE_DATA(to) + to_start
1032 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001033 }
1034 else if (from_kind == PyUnicode_2BYTE_KIND
1035 && to_kind == PyUnicode_4BYTE_KIND)
1036 {
1037 _PyUnicode_CONVERT_BYTES(
1038 Py_UCS2, Py_UCS4,
1039 PyUnicode_2BYTE_DATA(from) + from_start,
1040 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1041 PyUnicode_4BYTE_DATA(to) + to_start
1042 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001043 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 /* check if max_char(from substring) <= max_char(to) */
1046 if (from_kind > to_kind
1047 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001048 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001049 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 /* slow path to check for character overflow */
1051 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 Py_ssize_t i;
1054
Victor Stinnera0702ab2011-09-29 14:14:38 +02001055 for (i=0; i < how_many; i++) {
1056 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001057 if (check_maxchar) {
1058 if (ch > to_maxchar)
1059 return 1;
1060 }
1061 else {
1062 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001063 }
1064 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1065 }
1066 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001067 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 return -1;
1069 }
1070 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001071 return 0;
1072}
1073
1074static void
1075copy_characters(PyObject *to, Py_ssize_t to_start,
1076 PyObject *from, Py_ssize_t from_start,
1077 Py_ssize_t how_many)
1078{
1079 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1080}
1081
1082Py_ssize_t
1083PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1084 PyObject *from, Py_ssize_t from_start,
1085 Py_ssize_t how_many)
1086{
1087 int err;
1088
1089 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1090 PyErr_BadInternalCall();
1091 return -1;
1092 }
1093
1094 if (PyUnicode_READY(from))
1095 return -1;
1096 if (PyUnicode_READY(to))
1097 return -1;
1098
1099 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1100 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1101 PyErr_Format(PyExc_SystemError,
1102 "Cannot write %zi characters at %zi "
1103 "in a string of %zi characters",
1104 how_many, to_start, PyUnicode_GET_LENGTH(to));
1105 return -1;
1106 }
1107
1108 if (how_many == 0)
1109 return 0;
1110
1111 if (_PyUnicode_Dirty(to))
1112 return -1;
1113
1114 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1115 if (err) {
1116 PyErr_Format(PyExc_SystemError,
1117 "Cannot copy %s characters "
1118 "into a string of %s characters",
1119 unicode_kind_name(from),
1120 unicode_kind_name(to));
1121 return -1;
1122 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001123 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124}
1125
Victor Stinner17222162011-09-28 22:15:37 +02001126/* Find the maximum code point and count the number of surrogate pairs so a
1127 correct string length can be computed before converting a string to UCS4.
1128 This function counts single surrogates as a character and not as a pair.
1129
1130 Return 0 on success, or -1 on error. */
1131static int
1132find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1133 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134{
1135 const wchar_t *iter;
1136
Victor Stinnerc53be962011-10-02 21:33:54 +02001137 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 *num_surrogates = 0;
1139 *maxchar = 0;
1140
1141 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001142 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001144#if SIZEOF_WCHAR_T != 2
1145 if (*maxchar >= 0x10000)
1146 return 0;
1147#endif
1148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149#if SIZEOF_WCHAR_T == 2
1150 if (*iter >= 0xD800 && *iter <= 0xDBFF
1151 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1152 {
1153 Py_UCS4 surrogate_val;
1154 surrogate_val = (((iter[0] & 0x3FF)<<10)
1155 | (iter[1] & 0x3FF)) + 0x10000;
1156 ++(*num_surrogates);
1157 if (surrogate_val > *maxchar)
1158 *maxchar = surrogate_val;
1159 iter += 2;
1160 }
1161 else
1162 iter++;
1163#else
1164 iter++;
1165#endif
1166 }
1167 return 0;
1168}
1169
#ifdef Py_DEBUG
/* Debug-build counter, incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1173
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001174static int
1175unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001177 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 wchar_t *end;
1179 Py_UCS4 maxchar = 0;
1180 Py_ssize_t num_surrogates;
1181#if SIZEOF_WCHAR_T == 2
1182 Py_ssize_t length_wo_surrogates;
1183#endif
1184
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001185 assert(p_obj != NULL);
1186 unicode = (PyUnicodeObject *)*p_obj;
1187
Georg Brandl7597add2011-10-05 16:36:47 +02001188 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001189 strings were created using _PyObject_New() and where no canonical
1190 representation (the str field) has been set yet aka strings
1191 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001192 assert(_PyUnicode_CHECK(unicode));
1193 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001195 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001196 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001197 /* Actually, it should neither be interned nor be anything else: */
1198 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199
1200#ifdef Py_DEBUG
1201 ++unicode_ready_calls;
1202#endif
1203
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001204#ifdef Py_DEBUG
1205 assert(!replace || Py_REFCNT(unicode) == 1);
1206#else
1207 if (replace && Py_REFCNT(unicode) != 1)
1208 replace = 0;
1209#endif
1210 if (replace) {
1211 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1212 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1213 /* Optimization for empty strings */
1214 if (len == 0) {
1215 Py_INCREF(unicode_empty);
1216 Py_DECREF(*p_obj);
1217 *p_obj = unicode_empty;
1218 return 0;
1219 }
1220 if (len == 1 && wstr[0] < 256) {
1221 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1222 if (latin1_char == NULL)
1223 return -1;
1224 Py_DECREF(*p_obj);
1225 *p_obj = latin1_char;
1226 return 0;
1227 }
1228 }
1229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001231 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001232 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234
1235 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001236 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1237 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 PyErr_NoMemory();
1239 return -1;
1240 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001241 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 _PyUnicode_WSTR(unicode), end,
1243 PyUnicode_1BYTE_DATA(unicode));
1244 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1245 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1246 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1247 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001248 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001250 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
1252 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001253 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001254 _PyUnicode_UTF8(unicode) = NULL;
1255 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 }
1257 PyObject_FREE(_PyUnicode_WSTR(unicode));
1258 _PyUnicode_WSTR(unicode) = NULL;
1259 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1260 }
1261 /* In this case we might have to convert down from 4-byte native
1262 wchar_t to 2-byte unicode. */
1263 else if (maxchar < 65536) {
1264 assert(num_surrogates == 0 &&
1265 "FindMaxCharAndNumSurrogatePairs() messed up");
1266
Victor Stinner506f5922011-09-28 22:34:18 +02001267#if SIZEOF_WCHAR_T == 2
1268 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001269 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001270 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1271 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1272 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001273 _PyUnicode_UTF8(unicode) = NULL;
1274 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001275#else
1276 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001277 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001278 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001279 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001280 PyErr_NoMemory();
1281 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 }
Victor Stinner506f5922011-09-28 22:34:18 +02001283 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1284 _PyUnicode_WSTR(unicode), end,
1285 PyUnicode_2BYTE_DATA(unicode));
1286 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1287 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1288 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001289 _PyUnicode_UTF8(unicode) = NULL;
1290 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001291 PyObject_FREE(_PyUnicode_WSTR(unicode));
1292 _PyUnicode_WSTR(unicode) = NULL;
1293 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1294#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 }
1296 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1297 else {
1298#if SIZEOF_WCHAR_T == 2
1299 /* in case the native representation is 2-bytes, we need to allocate a
1300 new normalized 4-byte version. */
1301 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001302 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1303 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 PyErr_NoMemory();
1305 return -1;
1306 }
1307 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1308 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001309 _PyUnicode_UTF8(unicode) = NULL;
1310 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001311 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1312 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001313 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 PyObject_FREE(_PyUnicode_WSTR(unicode));
1315 _PyUnicode_WSTR(unicode) = NULL;
1316 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1317#else
1318 assert(num_surrogates == 0);
1319
Victor Stinnerc3c74152011-10-02 20:39:55 +02001320 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001322 _PyUnicode_UTF8(unicode) = NULL;
1323 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1325#endif
1326 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1327 }
1328 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001329 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return 0;
1331}
1332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001333int
1334_PyUnicode_ReadyReplace(PyObject **op)
1335{
1336 return unicode_ready(op, 1);
1337}
1338
1339int
1340_PyUnicode_Ready(PyObject *op)
1341{
1342 return unicode_ready(&op, 0);
1343}
1344
Alexander Belopolsky40018472011-02-26 01:02:56 +00001345static void
1346unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347{
Walter Dörwald16807132007-05-25 13:52:07 +00001348 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001349 case SSTATE_NOT_INTERNED:
1350 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001351
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 case SSTATE_INTERNED_MORTAL:
1353 /* revive dead object temporarily for DelItem */
1354 Py_REFCNT(unicode) = 3;
1355 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1356 Py_FatalError(
1357 "deletion of interned string failed");
1358 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001359
Benjamin Peterson29060642009-01-31 22:14:21 +00001360 case SSTATE_INTERNED_IMMORTAL:
1361 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001362
Benjamin Peterson29060642009-01-31 22:14:21 +00001363 default:
1364 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001365 }
1366
Victor Stinner03490912011-10-03 23:45:12 +02001367 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001369 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371
1372 if (PyUnicode_IS_COMPACT(unicode)) {
1373 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 }
1375 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001376 if (_PyUnicode_DATA_ANY(unicode))
1377 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 }
1380}
1381
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the shared empty string or one of the cached
   single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a string in canonical form with exactly one character can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1398
Alexander Belopolsky40018472011-02-26 01:02:56 +00001399static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001400unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001401{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001402 if (Py_REFCNT(unicode) != 1)
1403 return 0;
1404 if (PyUnicode_CHECK_INTERNED(unicode))
1405 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001406#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001407 /* singleton refcount is greater than 1 */
1408 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001409#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001410 return 1;
1411}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412
Victor Stinnerfe226c02011-10-03 03:52:20 +02001413static int
1414unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1415{
1416 PyObject *unicode;
1417 Py_ssize_t old_length;
1418
1419 assert(p_unicode != NULL);
1420 unicode = *p_unicode;
1421
1422 assert(unicode != NULL);
1423 assert(PyUnicode_Check(unicode));
1424 assert(0 <= length);
1425
Victor Stinner910337b2011-10-03 03:20:16 +02001426 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001427 old_length = PyUnicode_WSTR_LENGTH(unicode);
1428 else
1429 old_length = PyUnicode_GET_LENGTH(unicode);
1430 if (old_length == length)
1431 return 0;
1432
Victor Stinnerfe226c02011-10-03 03:52:20 +02001433 if (!unicode_resizable(unicode)) {
1434 PyObject *copy = resize_copy(unicode, length);
1435 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001436 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001437 Py_DECREF(*p_unicode);
1438 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001440 }
1441
Victor Stinnerfe226c02011-10-03 03:52:20 +02001442 if (PyUnicode_IS_COMPACT(unicode)) {
1443 *p_unicode = resize_compact(unicode, length);
1444 if (*p_unicode == NULL)
1445 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001446 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001447 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001448 }
1449 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001450}
1451
Alexander Belopolsky40018472011-02-26 01:02:56 +00001452int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001453PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001454{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001455 PyObject *unicode;
1456 if (p_unicode == NULL) {
1457 PyErr_BadInternalCall();
1458 return -1;
1459 }
1460 unicode = *p_unicode;
1461 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1462 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1463 {
1464 PyErr_BadInternalCall();
1465 return -1;
1466 }
1467 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001468}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470static PyObject*
1471get_latin1_char(unsigned char ch)
1472{
Victor Stinnera464fc12011-10-02 20:39:30 +02001473 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001475 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 if (!unicode)
1477 return NULL;
1478 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001479 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 unicode_latin1[ch] = unicode;
1481 }
1482 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001483 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484}
1485
Alexander Belopolsky40018472011-02-26 01:02:56 +00001486PyObject *
1487PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488{
1489 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 Py_UCS4 maxchar = 0;
1491 Py_ssize_t num_surrogates;
1492
1493 if (u == NULL)
1494 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001496 /* If the Unicode data is known at construction time, we can apply
1497 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 /* Optimization for empty strings */
1500 if (size == 0 && unicode_empty != NULL) {
1501 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001502 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001503 }
Tim Petersced69f82003-09-16 20:30:58 +00001504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 /* Single character Unicode objects in the Latin-1 range are
1506 shared when using this constructor */
1507 if (size == 1 && *u < 256)
1508 return get_latin1_char((unsigned char)*u);
1509
1510 /* If not empty and not single character, copy the Unicode data
1511 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001512 if (find_maxchar_surrogates(u, u + size,
1513 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 return NULL;
1515
1516 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1517 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 if (!unicode)
1519 return NULL;
1520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 switch (PyUnicode_KIND(unicode)) {
1522 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001523 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1525 break;
1526 case PyUnicode_2BYTE_KIND:
1527#if Py_UNICODE_SIZE == 2
1528 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1529#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001530 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1532#endif
1533 break;
1534 case PyUnicode_4BYTE_KIND:
1535#if SIZEOF_WCHAR_T == 2
1536 /* This is the only case which has to process surrogates, thus
1537 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001538 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539#else
1540 assert(num_surrogates == 0);
1541 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1542#endif
1543 break;
1544 default:
1545 assert(0 && "Impossible state");
1546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001548 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return (PyObject *)unicode;
1550}
1551
Alexander Belopolsky40018472011-02-26 01:02:56 +00001552PyObject *
1553PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001554{
1555 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001556
Benjamin Peterson14339b62009-01-31 16:36:08 +00001557 if (size < 0) {
1558 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001560 return NULL;
1561 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001562
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001563 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001564 some optimizations which share commonly used objects.
1565 Also, this means the input must be UTF-8, so fall back to the
1566 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001567 if (u != NULL) {
1568
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 /* Optimization for empty strings */
1570 if (size == 0 && unicode_empty != NULL) {
1571 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001572 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001574
1575 /* Single characters are shared when using this constructor.
1576 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 if (size == 1 && Py_CHARMASK(*u) < 128)
1578 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001579
1580 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001581 }
1582
Walter Dörwald55507312007-05-18 13:12:10 +00001583 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001584 if (!unicode)
1585 return NULL;
1586
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001587 return (PyObject *)unicode;
1588}
1589
Alexander Belopolsky40018472011-02-26 01:02:56 +00001590PyObject *
1591PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001592{
1593 size_t size = strlen(u);
1594 if (size > PY_SSIZE_T_MAX) {
1595 PyErr_SetString(PyExc_OverflowError, "input too long");
1596 return NULL;
1597 }
1598
1599 return PyUnicode_FromStringAndSize(u, size);
1600}
1601
Victor Stinnere57b1c02011-09-28 22:20:48 +02001602static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001603unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001604{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001605 PyObject *res;
1606#ifdef Py_DEBUG
1607 const unsigned char *p;
1608 const unsigned char *end = s + size;
1609 for (p=s; p < end; p++) {
1610 assert(*p < 128);
1611 }
1612#endif
1613 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001614 if (!res)
1615 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001616 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001617 return res;
1618}
1619
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001620static Py_UCS4
1621kind_maxchar_limit(unsigned int kind)
1622{
1623 switch(kind) {
1624 case PyUnicode_1BYTE_KIND:
1625 return 0x80;
1626 case PyUnicode_2BYTE_KIND:
1627 return 0x100;
1628 case PyUnicode_4BYTE_KIND:
1629 return 0x10000;
1630 default:
1631 assert(0 && "invalid kind");
1632 return 0x10ffff;
1633 }
1634}
1635
Victor Stinner702c7342011-10-05 13:50:52 +02001636static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001637_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001640 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001642
1643 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 for (i = 0; i < size; i++) {
1645 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001646 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001648 }
1649 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001650 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 if (!res)
1652 return NULL;
1653 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001654 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001656}
1657
Victor Stinnere57b1c02011-09-28 22:20:48 +02001658static PyObject*
1659_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660{
1661 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001664
1665 assert(size >= 0);
1666 for (i = 0; i < size; i++) {
1667 if (u[i] > max_char) {
1668 max_char = u[i];
1669 if (max_char >= 256)
1670 break;
1671 }
1672 }
1673 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 if (!res)
1675 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001676 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1678 else
1679 for (i = 0; i < size; i++)
1680 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001681 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return res;
1683}
1684
Victor Stinnere57b1c02011-09-28 22:20:48 +02001685static PyObject*
1686_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687{
1688 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001689 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001691
1692 assert(size >= 0);
1693 for (i = 0; i < size; i++) {
1694 if (u[i] > max_char) {
1695 max_char = u[i];
1696 if (max_char >= 0x10000)
1697 break;
1698 }
1699 }
1700 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 if (!res)
1702 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001703 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1705 else {
1706 int kind = PyUnicode_KIND(res);
1707 void *data = PyUnicode_DATA(res);
1708 for (i = 0; i < size; i++)
1709 PyUnicode_WRITE(kind, data, i, u[i]);
1710 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001711 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 return res;
1713}
1714
1715PyObject*
1716PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1717{
1718 switch(kind) {
1719 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001720 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001722 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001724 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001725 default:
1726 assert(0 && "invalid kind");
1727 PyErr_SetString(PyExc_SystemError, "invalid kind");
1728 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730}
1731
Victor Stinner034f6cf2011-09-30 02:26:44 +02001732PyObject*
1733PyUnicode_Copy(PyObject *unicode)
1734{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001735 Py_ssize_t size;
1736 PyObject *copy;
1737 void *data;
1738
Victor Stinner034f6cf2011-09-30 02:26:44 +02001739 if (!PyUnicode_Check(unicode)) {
1740 PyErr_BadInternalCall();
1741 return NULL;
1742 }
1743 if (PyUnicode_READY(unicode))
1744 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001745
1746 size = PyUnicode_GET_LENGTH(unicode);
1747 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1748 if (!copy)
1749 return NULL;
1750 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1751
1752 data = PyUnicode_DATA(unicode);
1753 switch (PyUnicode_KIND(unicode))
1754 {
1755 case PyUnicode_1BYTE_KIND:
1756 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1757 break;
1758 case PyUnicode_2BYTE_KIND:
1759 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1760 break;
1761 case PyUnicode_4BYTE_KIND:
1762 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1763 break;
1764 default:
1765 assert(0);
1766 break;
1767 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001768 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001769 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001770}
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772
Victor Stinnerbc603d12011-10-02 01:00:40 +02001773/* Widen Unicode objects to larger buffers. Don't write terminating null
1774 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
1776void*
1777_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1778{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001779 Py_ssize_t len;
1780 void *result;
1781 unsigned int skind;
1782
1783 if (PyUnicode_READY(s))
1784 return NULL;
1785
1786 len = PyUnicode_GET_LENGTH(s);
1787 skind = PyUnicode_KIND(s);
1788 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001789 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 return NULL;
1791 }
1792 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001793 case PyUnicode_2BYTE_KIND:
1794 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1795 if (!result)
1796 return PyErr_NoMemory();
1797 assert(skind == PyUnicode_1BYTE_KIND);
1798 _PyUnicode_CONVERT_BYTES(
1799 Py_UCS1, Py_UCS2,
1800 PyUnicode_1BYTE_DATA(s),
1801 PyUnicode_1BYTE_DATA(s) + len,
1802 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001804 case PyUnicode_4BYTE_KIND:
1805 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1806 if (!result)
1807 return PyErr_NoMemory();
1808 if (skind == PyUnicode_2BYTE_KIND) {
1809 _PyUnicode_CONVERT_BYTES(
1810 Py_UCS2, Py_UCS4,
1811 PyUnicode_2BYTE_DATA(s),
1812 PyUnicode_2BYTE_DATA(s) + len,
1813 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001815 else {
1816 assert(skind == PyUnicode_1BYTE_KIND);
1817 _PyUnicode_CONVERT_BYTES(
1818 Py_UCS1, Py_UCS4,
1819 PyUnicode_1BYTE_DATA(s),
1820 PyUnicode_1BYTE_DATA(s) + len,
1821 result);
1822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001824 default:
1825 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 }
Victor Stinner01698042011-10-04 00:04:26 +02001827 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 return NULL;
1829}
1830
1831static Py_UCS4*
1832as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1833 int copy_null)
1834{
1835 int kind;
1836 void *data;
1837 Py_ssize_t len, targetlen;
1838 if (PyUnicode_READY(string) == -1)
1839 return NULL;
1840 kind = PyUnicode_KIND(string);
1841 data = PyUnicode_DATA(string);
1842 len = PyUnicode_GET_LENGTH(string);
1843 targetlen = len;
1844 if (copy_null)
1845 targetlen++;
1846 if (!target) {
1847 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1848 PyErr_NoMemory();
1849 return NULL;
1850 }
1851 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1852 if (!target) {
1853 PyErr_NoMemory();
1854 return NULL;
1855 }
1856 }
1857 else {
1858 if (targetsize < targetlen) {
1859 PyErr_Format(PyExc_SystemError,
1860 "string is longer than the buffer");
1861 if (copy_null && 0 < targetsize)
1862 target[0] = 0;
1863 return NULL;
1864 }
1865 }
1866 if (kind != PyUnicode_4BYTE_KIND) {
1867 Py_ssize_t i;
1868 for (i = 0; i < len; i++)
1869 target[i] = PyUnicode_READ(kind, data, i);
1870 }
1871 else
1872 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1873 if (copy_null)
1874 target[len] = 0;
1875 return target;
1876}
1877
1878Py_UCS4*
1879PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1880 int copy_null)
1881{
1882 if (target == NULL || targetsize < 1) {
1883 PyErr_BadInternalCall();
1884 return NULL;
1885 }
1886 return as_ucs4(string, target, targetsize, copy_null);
1887}
1888
1889Py_UCS4*
1890PyUnicode_AsUCS4Copy(PyObject *string)
1891{
1892 return as_ucs4(string, NULL, 0, 1);
1893}
1894
1895#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897PyObject *
1898PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001901 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001903 PyErr_BadInternalCall();
1904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 }
1906
Martin v. Löwis790465f2008-04-05 20:41:37 +00001907 if (size == -1) {
1908 size = wcslen(w);
1909 }
1910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912}
1913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001915
Walter Dörwald346737f2007-05-31 10:44:43 +00001916static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001917makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1918 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001919{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 *fmt++ = '%';
1921 if (width) {
1922 if (zeropad)
1923 *fmt++ = '0';
1924 fmt += sprintf(fmt, "%d", width);
1925 }
1926 if (precision)
1927 fmt += sprintf(fmt, ".%d", precision);
1928 if (longflag)
1929 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001930 else if (longlongflag) {
1931 /* longlongflag should only ever be nonzero on machines with
1932 HAVE_LONG_LONG defined */
1933#ifdef HAVE_LONG_LONG
1934 char *f = PY_FORMAT_LONG_LONG;
1935 while (*f)
1936 *fmt++ = *f++;
1937#else
1938 /* we shouldn't ever get here */
1939 assert(0);
1940 *fmt++ = 'l';
1941#endif
1942 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001943 else if (size_tflag) {
1944 char *f = PY_FORMAT_SIZE_T;
1945 while (*f)
1946 *fmt++ = *f++;
1947 }
1948 *fmt++ = c;
1949 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001950}
1951
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%'.

   `f` points at the '%'.  The decimal width, an optional ".precision" and
   an optional length modifier ("l", "ll" when HAVE_LONG_LONG is defined,
   or "z", each only when followed by 'd', 'u' or 'i') are consumed.  Every
   output pointer may be NULL, in which case that value is not stored.

   Returns the position after the consumed characters; for a format such
   as "%.3%s" or one that ends right after the digits, the returned
   pointer is moved back one character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f)) {
        width = (width*10) + *f - '0';
        f++;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            precision = (precision*10) + *f - '0';
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* Store only the values the caller asked for. */
    if (p_width != NULL) {
        *p_width = width;
    }
    if (p_precision != NULL) {
        *p_precision = precision;
    }
    if (p_longflag != NULL) {
        *p_longflag = longflag;
    }
    if (p_longlongflag != NULL) {
        *p_longlongflag = longlongflag;
    }
    if (p_size_tflag != NULL) {
        *p_size_tflag = size_tflag;
    }
    return f;
}
2016
/* Upper bound on the characters produced by %ld: 21 characters cover a
   64-bit integer written in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2024
Walter Dörwaldd2034312007-05-18 16:29:38 +00002025PyObject *
2026PyUnicode_FromFormatV(const char *format, va_list vargs)
2027{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002028 va_list count;
2029 Py_ssize_t callcount = 0;
2030 PyObject **callresults = NULL;
2031 PyObject **callresult = NULL;
2032 Py_ssize_t n = 0;
2033 int width = 0;
2034 int precision = 0;
2035 int zeropad;
2036 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002037 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002038 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002039 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2041 Py_UCS4 argmaxchar;
2042 Py_ssize_t numbersize = 0;
2043 char *numberresults = NULL;
2044 char *numberresult = NULL;
2045 Py_ssize_t i;
2046 int kind;
2047 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002048
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002049 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002050 /* step 1: count the number of %S/%R/%A/%s format specifications
2051 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2052 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002054 * also estimate a upper bound for all the number formats in the string,
2055 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 for (f = format; *f; f++) {
2058 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002059 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2061 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2062 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2063 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002066#ifdef HAVE_LONG_LONG
2067 if (longlongflag) {
2068 if (width < MAX_LONG_LONG_CHARS)
2069 width = MAX_LONG_LONG_CHARS;
2070 }
2071 else
2072#endif
2073 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2074 including sign. Decimal takes the most space. This
2075 isn't enough for octal. If a width is specified we
2076 need more (which we allocate later). */
2077 if (width < MAX_LONG_CHARS)
2078 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079
2080 /* account for the size + '\0' to separate numbers
2081 inside of the numberresults buffer */
2082 numbersize += (width + 1);
2083 }
2084 }
2085 else if ((unsigned char)*f > 127) {
2086 PyErr_Format(PyExc_ValueError,
2087 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2088 "string, got a non-ASCII byte: 0x%02x",
2089 (unsigned char)*f);
2090 return NULL;
2091 }
2092 }
2093 /* step 2: allocate memory for the results of
2094 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2095 if (callcount) {
2096 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2097 if (!callresults) {
2098 PyErr_NoMemory();
2099 return NULL;
2100 }
2101 callresult = callresults;
2102 }
2103 /* step 2.5: allocate memory for the results of formating numbers */
2104 if (numbersize) {
2105 numberresults = PyObject_Malloc(numbersize);
2106 if (!numberresults) {
2107 PyErr_NoMemory();
2108 goto fail;
2109 }
2110 numberresult = numberresults;
2111 }
2112
2113 /* step 3: format numbers and figure out how large a buffer we need */
2114 for (f = format; *f; f++) {
2115 if (*f == '%') {
2116 const char* p;
2117 int longflag;
2118 int longlongflag;
2119 int size_tflag;
2120 int numprinted;
2121
2122 p = f;
2123 zeropad = (f[1] == '0');
2124 f = parse_format_flags(f, &width, &precision,
2125 &longflag, &longlongflag, &size_tflag);
2126 switch (*f) {
2127 case 'c':
2128 {
2129 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002130 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 n++;
2132 break;
2133 }
2134 case '%':
2135 n++;
2136 break;
2137 case 'i':
2138 case 'd':
2139 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2140 width, precision, *f);
2141 if (longflag)
2142 numprinted = sprintf(numberresult, fmt,
2143 va_arg(count, long));
2144#ifdef HAVE_LONG_LONG
2145 else if (longlongflag)
2146 numprinted = sprintf(numberresult, fmt,
2147 va_arg(count, PY_LONG_LONG));
2148#endif
2149 else if (size_tflag)
2150 numprinted = sprintf(numberresult, fmt,
2151 va_arg(count, Py_ssize_t));
2152 else
2153 numprinted = sprintf(numberresult, fmt,
2154 va_arg(count, int));
2155 n += numprinted;
2156 /* advance by +1 to skip over the '\0' */
2157 numberresult += (numprinted + 1);
2158 assert(*(numberresult - 1) == '\0');
2159 assert(*(numberresult - 2) != '\0');
2160 assert(numprinted >= 0);
2161 assert(numberresult <= numberresults + numbersize);
2162 break;
2163 case 'u':
2164 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2165 width, precision, 'u');
2166 if (longflag)
2167 numprinted = sprintf(numberresult, fmt,
2168 va_arg(count, unsigned long));
2169#ifdef HAVE_LONG_LONG
2170 else if (longlongflag)
2171 numprinted = sprintf(numberresult, fmt,
2172 va_arg(count, unsigned PY_LONG_LONG));
2173#endif
2174 else if (size_tflag)
2175 numprinted = sprintf(numberresult, fmt,
2176 va_arg(count, size_t));
2177 else
2178 numprinted = sprintf(numberresult, fmt,
2179 va_arg(count, unsigned int));
2180 n += numprinted;
2181 numberresult += (numprinted + 1);
2182 assert(*(numberresult - 1) == '\0');
2183 assert(*(numberresult - 2) != '\0');
2184 assert(numprinted >= 0);
2185 assert(numberresult <= numberresults + numbersize);
2186 break;
2187 case 'x':
2188 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2189 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2190 n += numprinted;
2191 numberresult += (numprinted + 1);
2192 assert(*(numberresult - 1) == '\0');
2193 assert(*(numberresult - 2) != '\0');
2194 assert(numprinted >= 0);
2195 assert(numberresult <= numberresults + numbersize);
2196 break;
2197 case 'p':
2198 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2199 /* %p is ill-defined: ensure leading 0x. */
2200 if (numberresult[1] == 'X')
2201 numberresult[1] = 'x';
2202 else if (numberresult[1] != 'x') {
2203 memmove(numberresult + 2, numberresult,
2204 strlen(numberresult) + 1);
2205 numberresult[0] = '0';
2206 numberresult[1] = 'x';
2207 numprinted += 2;
2208 }
2209 n += numprinted;
2210 numberresult += (numprinted + 1);
2211 assert(*(numberresult - 1) == '\0');
2212 assert(*(numberresult - 2) != '\0');
2213 assert(numprinted >= 0);
2214 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002215 break;
2216 case 's':
2217 {
2218 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002219 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002220 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2221 if (!str)
2222 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 /* since PyUnicode_DecodeUTF8 returns already flexible
2224 unicode objects, there is no need to call ready on them */
2225 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002226 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002228 /* Remember the str and switch to the next slot */
2229 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002230 break;
2231 }
2232 case 'U':
2233 {
2234 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002235 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 if (PyUnicode_READY(obj) == -1)
2237 goto fail;
2238 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002239 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002241 break;
2242 }
2243 case 'V':
2244 {
2245 PyObject *obj = va_arg(count, PyObject *);
2246 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002247 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002249 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002250 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 if (PyUnicode_READY(obj) == -1)
2252 goto fail;
2253 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002254 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002256 *callresult++ = NULL;
2257 }
2258 else {
2259 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2260 if (!str_obj)
2261 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002262 if (PyUnicode_READY(str_obj)) {
2263 Py_DECREF(str_obj);
2264 goto fail;
2265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002267 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002269 *callresult++ = str_obj;
2270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 break;
2272 }
2273 case 'S':
2274 {
2275 PyObject *obj = va_arg(count, PyObject *);
2276 PyObject *str;
2277 assert(obj);
2278 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002282 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 /* Remember the str and switch to the next slot */
2285 *callresult++ = str;
2286 break;
2287 }
2288 case 'R':
2289 {
2290 PyObject *obj = va_arg(count, PyObject *);
2291 PyObject *repr;
2292 assert(obj);
2293 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002297 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 /* Remember the repr and switch to the next slot */
2300 *callresult++ = repr;
2301 break;
2302 }
2303 case 'A':
2304 {
2305 PyObject *obj = va_arg(count, PyObject *);
2306 PyObject *ascii;
2307 assert(obj);
2308 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002312 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 /* Remember the repr and switch to the next slot */
2315 *callresult++ = ascii;
2316 break;
2317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 default:
2319 /* if we stumble upon an unknown
2320 formatting code, copy the rest of
2321 the format string to the output
2322 string. (we cannot just skip the
2323 code, since there's no way to know
2324 what's in the argument list) */
2325 n += strlen(p);
2326 goto expand;
2327 }
2328 } else
2329 n++;
2330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002332 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 we don't have to resize the string.
2335 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002336 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002337 if (!string)
2338 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339 kind = PyUnicode_KIND(string);
2340 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002346 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002347
2348 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2350 /* checking for == because the last argument could be a empty
2351 string, which causes i to point to end, the assert at the end of
2352 the loop */
2353 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002354
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 switch (*f) {
2356 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002357 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 const int ordinal = va_arg(vargs, int);
2359 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002360 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002361 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002362 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002364 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366 case 'p':
2367 /* unused, since we already have the result */
2368 if (*f == 'p')
2369 (void) va_arg(vargs, void *);
2370 else
2371 (void) va_arg(vargs, int);
2372 /* extract the result from numberresults and append. */
2373 for (; *numberresult; ++i, ++numberresult)
2374 PyUnicode_WRITE(kind, data, i, *numberresult);
2375 /* skip over the separating '\0' */
2376 assert(*numberresult == '\0');
2377 numberresult++;
2378 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 break;
2380 case 's':
2381 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002382 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002384 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 size = PyUnicode_GET_LENGTH(*callresult);
2386 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002387 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002389 /* We're done with the unicode()/repr() => forget it */
2390 Py_DECREF(*callresult);
2391 /* switch to next unicode()/repr() result */
2392 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 break;
2394 }
2395 case 'U':
2396 {
2397 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 Py_ssize_t size;
2399 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2400 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002401 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 break;
2404 }
2405 case 'V':
2406 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002409 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 size = PyUnicode_GET_LENGTH(obj);
2412 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002413 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002415 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 size = PyUnicode_GET_LENGTH(*callresult);
2417 assert(PyUnicode_KIND(*callresult) <=
2418 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002419 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002421 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002423 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 break;
2425 }
2426 case 'S':
2427 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002428 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002430 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002431 /* unused, since we already have the result */
2432 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002434 copy_characters(string, i, *callresult, 0, size);
2435 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002436 /* We're done with the unicode()/repr() => forget it */
2437 Py_DECREF(*callresult);
2438 /* switch to next unicode()/repr() result */
2439 ++callresult;
2440 break;
2441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 break;
2445 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 for (; *p; ++p, ++i)
2447 PyUnicode_WRITE(kind, data, i, *p);
2448 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002449 goto end;
2450 }
Victor Stinner1205f272010-09-11 00:54:47 +00002451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 else {
2453 assert(i < PyUnicode_GET_LENGTH(string));
2454 PyUnicode_WRITE(kind, data, i++, *f);
2455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002458
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 if (callresults)
2461 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 if (numberresults)
2463 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002464 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002467 if (callresults) {
2468 PyObject **callresult2 = callresults;
2469 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002471 ++callresult2;
2472 }
2473 PyObject_Free(callresults);
2474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 if (numberresults)
2476 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002478}
2479
Walter Dörwaldd2034312007-05-18 16:29:38 +00002480PyObject *
2481PyUnicode_FromFormat(const char *format, ...)
2482{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 PyObject* ret;
2484 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002485
2486#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002488#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002490#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 ret = PyUnicode_FromFormatV(format, vargs);
2492 va_end(vargs);
2493 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002494}
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496#ifdef HAVE_WCHAR_H
2497
Victor Stinner5593d8a2010-10-02 11:11:27 +00002498/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2499 convert a Unicode object to a wide character string.
2500
Victor Stinnerd88d9832011-09-06 02:00:05 +02002501 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002502 character) required to convert the unicode object. Ignore size argument.
2503
Victor Stinnerd88d9832011-09-06 02:00:05 +02002504 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002505 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002506 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002507static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002508unicode_aswidechar(PyUnicodeObject *unicode,
2509 wchar_t *w,
2510 Py_ssize_t size)
2511{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002512 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 const wchar_t *wstr;
2514
2515 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2516 if (wstr == NULL)
2517 return -1;
2518
Victor Stinner5593d8a2010-10-02 11:11:27 +00002519 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002520 if (size > res)
2521 size = res + 1;
2522 else
2523 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002525 return res;
2526 }
2527 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002529}
2530
2531Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002532PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002533 wchar_t *w,
2534 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535{
2536 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002537 PyErr_BadInternalCall();
2538 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002540 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541}
2542
Victor Stinner137c34c2010-09-29 10:25:54 +00002543wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002544PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002545 Py_ssize_t *size)
2546{
2547 wchar_t* buffer;
2548 Py_ssize_t buflen;
2549
2550 if (unicode == NULL) {
2551 PyErr_BadInternalCall();
2552 return NULL;
2553 }
2554
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002555 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 if (buflen == -1)
2557 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002558 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002559 PyErr_NoMemory();
2560 return NULL;
2561 }
2562
Victor Stinner137c34c2010-09-29 10:25:54 +00002563 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2564 if (buffer == NULL) {
2565 PyErr_NoMemory();
2566 return NULL;
2567 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002568 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 if (buflen == -1)
2570 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002571 if (size != NULL)
2572 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002573 return buffer;
2574}
2575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577
Alexander Belopolsky40018472011-02-26 01:02:56 +00002578PyObject *
2579PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002582 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002583 PyErr_SetString(PyExc_ValueError,
2584 "chr() arg not in range(0x110000)");
2585 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002586 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 if (ordinal < 256)
2589 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 v = PyUnicode_New(1, ordinal);
2592 if (v == NULL)
2593 return NULL;
2594 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002595 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002597}
2598
Alexander Belopolsky40018472011-02-26 01:02:56 +00002599PyObject *
2600PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002602 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002604 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002605 if (PyUnicode_READY(obj))
2606 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 Py_INCREF(obj);
2608 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002609 }
2610 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 /* For a Unicode subtype that's not a Unicode object,
2612 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002613 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002614 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002615 PyErr_Format(PyExc_TypeError,
2616 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002617 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002618 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002619}
2620
Alexander Belopolsky40018472011-02-26 01:02:56 +00002621PyObject *
2622PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002623 const char *encoding,
2624 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002625{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002626 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 PyErr_BadInternalCall();
2631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002633
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002634 /* Decoding bytes objects is the most common case and should be fast */
2635 if (PyBytes_Check(obj)) {
2636 if (PyBytes_GET_SIZE(obj) == 0) {
2637 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002638 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002639 }
2640 else {
2641 v = PyUnicode_Decode(
2642 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2643 encoding, errors);
2644 }
2645 return v;
2646 }
2647
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002648 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002649 PyErr_SetString(PyExc_TypeError,
2650 "decoding str is not supported");
2651 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002653
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002654 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2655 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2656 PyErr_Format(PyExc_TypeError,
2657 "coercing to str: need bytes, bytearray "
2658 "or buffer-like object, %.80s found",
2659 Py_TYPE(obj)->tp_name);
2660 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002661 }
Tim Petersced69f82003-09-16 20:30:58 +00002662
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002663 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002665 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 }
Tim Petersced69f82003-09-16 20:30:58 +00002667 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002668 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002669
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002670 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002671 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672}
2673
/* Write a normalized copy of `encoding` into `lower`: upper-case letters are
   lowered and every '_' becomes '-', so that spellings such as UTF_8 match
   the canonical names.  `lower_len` is the capacity of `lower`, including
   room for the terminating null byte.

   Return 1 on success, 0 if the name does not fit (i.e. it is longer than
   lower_len-1 characters); in the latter case `lower` is not terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++, dst++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst = ch;
    }
    *dst = '\0';
    return 1;
}
2706
Alexander Belopolsky40018472011-02-26 01:02:56 +00002707PyObject *
2708PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002709 Py_ssize_t size,
2710 const char *encoding,
2711 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002712{
2713 PyObject *buffer = NULL, *unicode;
2714 Py_buffer info;
2715 char lower[11]; /* Enough for any encoding shortcut */
2716
2717 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002718 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002719
2720 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002721 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002722 if ((strcmp(lower, "utf-8") == 0) ||
2723 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002724 return PyUnicode_DecodeUTF8(s, size, errors);
2725 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002726 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002727 (strcmp(lower, "iso-8859-1") == 0))
2728 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002729#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002730 else if (strcmp(lower, "mbcs") == 0)
2731 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002732#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002733 else if (strcmp(lower, "ascii") == 0)
2734 return PyUnicode_DecodeASCII(s, size, errors);
2735 else if (strcmp(lower, "utf-16") == 0)
2736 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2737 else if (strcmp(lower, "utf-32") == 0)
2738 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740
2741 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002742 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002743 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002744 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002745 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 if (buffer == NULL)
2747 goto onError;
2748 unicode = PyCodec_Decode(buffer, encoding, errors);
2749 if (unicode == NULL)
2750 goto onError;
2751 if (!PyUnicode_Check(unicode)) {
2752 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002753 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002754 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 Py_DECREF(unicode);
2756 goto onError;
2757 }
2758 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002759#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002760 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 Py_DECREF(unicode);
2762 return NULL;
2763 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002764#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002767
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 Py_XDECREF(buffer);
2770 return NULL;
2771}
2772
Alexander Belopolsky40018472011-02-26 01:02:56 +00002773PyObject *
2774PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002775 const char *encoding,
2776 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002777{
2778 PyObject *v;
2779
2780 if (!PyUnicode_Check(unicode)) {
2781 PyErr_BadArgument();
2782 goto onError;
2783 }
2784
2785 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002787
2788 /* Decode via the codec registry */
2789 v = PyCodec_Decode(unicode, encoding, errors);
2790 if (v == NULL)
2791 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002792 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002793 return v;
2794
Benjamin Peterson29060642009-01-31 22:14:21 +00002795 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002796 return NULL;
2797}
2798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799PyObject *
2800PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002801 const char *encoding,
2802 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002803{
2804 PyObject *v;
2805
2806 if (!PyUnicode_Check(unicode)) {
2807 PyErr_BadArgument();
2808 goto onError;
2809 }
2810
2811 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002813
2814 /* Decode via the codec registry */
2815 v = PyCodec_Decode(unicode, encoding, errors);
2816 if (v == NULL)
2817 goto onError;
2818 if (!PyUnicode_Check(v)) {
2819 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002820 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002821 Py_TYPE(v)->tp_name);
2822 Py_DECREF(v);
2823 goto onError;
2824 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002825 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002826 return v;
2827
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002829 return NULL;
2830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 Py_ssize_t size,
2835 const char *encoding,
2836 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
2838 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 unicode = PyUnicode_FromUnicode(s, size);
2841 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2844 Py_DECREF(unicode);
2845 return v;
2846}
2847
Alexander Belopolsky40018472011-02-26 01:02:56 +00002848PyObject *
2849PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002850 const char *encoding,
2851 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002852{
2853 PyObject *v;
2854
2855 if (!PyUnicode_Check(unicode)) {
2856 PyErr_BadArgument();
2857 goto onError;
2858 }
2859
2860 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002861 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002862
2863 /* Encode via the codec registry */
2864 v = PyCodec_Encode(unicode, encoding, errors);
2865 if (v == NULL)
2866 goto onError;
2867 return v;
2868
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002870 return NULL;
2871}
2872
Victor Stinnerad158722010-10-27 00:25:46 +00002873PyObject *
2874PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002875{
Victor Stinner99b95382011-07-04 14:23:54 +02002876#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002877 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2878 PyUnicode_GET_SIZE(unicode),
2879 NULL);
2880#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002881 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002882#else
Victor Stinner793b5312011-04-27 00:24:21 +02002883 PyInterpreterState *interp = PyThreadState_GET()->interp;
2884 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2885 cannot use it to encode and decode filenames before it is loaded. Load
2886 the Python codec requires to encode at least its own filename. Use the C
2887 version of the locale codec until the codec registry is initialized and
2888 the Python codec is loaded.
2889
2890 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2891 cannot only rely on it: check also interp->fscodec_initialized for
2892 subinterpreters. */
2893 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002894 return PyUnicode_AsEncodedString(unicode,
2895 Py_FileSystemDefaultEncoding,
2896 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002897 }
2898 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002899 /* locale encoding with surrogateescape */
2900 wchar_t *wchar;
2901 char *bytes;
2902 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002903 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002904
2905 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2906 if (wchar == NULL)
2907 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002908 bytes = _Py_wchar2char(wchar, &error_pos);
2909 if (bytes == NULL) {
2910 if (error_pos != (size_t)-1) {
2911 char *errmsg = strerror(errno);
2912 PyObject *exc = NULL;
2913 if (errmsg == NULL)
2914 errmsg = "Py_wchar2char() failed";
2915 raise_encode_exception(&exc,
2916 "filesystemencoding",
2917 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2918 error_pos, error_pos+1,
2919 errmsg);
2920 Py_XDECREF(exc);
2921 }
2922 else
2923 PyErr_NoMemory();
2924 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002925 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002926 }
2927 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002928
2929 bytes_obj = PyBytes_FromString(bytes);
2930 PyMem_Free(bytes);
2931 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002932 }
Victor Stinnerad158722010-10-27 00:25:46 +00002933#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002934}
2935
Alexander Belopolsky40018472011-02-26 01:02:56 +00002936PyObject *
2937PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002938 const char *encoding,
2939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940{
2941 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002942 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002943
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 if (!PyUnicode_Check(unicode)) {
2945 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 }
Fred Drakee4315f52000-05-09 19:53:39 +00002948
Victor Stinner2f283c22011-03-02 01:21:46 +00002949 if (encoding == NULL) {
2950 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002952 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002954 }
Fred Drakee4315f52000-05-09 19:53:39 +00002955
2956 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002957 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002958 if ((strcmp(lower, "utf-8") == 0) ||
2959 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002960 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002961 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002963 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002965 }
Victor Stinner37296e82010-06-10 13:36:23 +00002966 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002967 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002968 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002970#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if (strcmp(lower, "mbcs") == 0)
2972 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2973 PyUnicode_GET_SIZE(unicode),
2974 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002975#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002976 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979
2980 /* Encode via the codec registry */
2981 v = PyCodec_Encode(unicode, encoding, errors);
2982 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002983 return NULL;
2984
2985 /* The normal path */
2986 if (PyBytes_Check(v))
2987 return v;
2988
2989 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002990 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002991 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002992 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002993
2994 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2995 "encoder %s returned bytearray instead of bytes",
2996 encoding);
2997 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002998 Py_DECREF(v);
2999 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003000 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003001
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003002 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3003 Py_DECREF(v);
3004 return b;
3005 }
3006
3007 PyErr_Format(PyExc_TypeError,
3008 "encoder did not return a bytes object (type=%.400s)",
3009 Py_TYPE(v)->tp_name);
3010 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003011 return NULL;
3012}
3013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003016 const char *encoding,
3017 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003018{
3019 PyObject *v;
3020
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 goto onError;
3024 }
3025
3026 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003028
3029 /* Encode via the codec registry */
3030 v = PyCodec_Encode(unicode, encoding, errors);
3031 if (v == NULL)
3032 goto onError;
3033 if (!PyUnicode_Check(v)) {
3034 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003035 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036 Py_TYPE(v)->tp_name);
3037 Py_DECREF(v);
3038 goto onError;
3039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003041
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 return NULL;
3044}
3045
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003046PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003047PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003048 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003049 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3050}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003051
Christian Heimes5894ba72007-11-04 11:43:14 +00003052PyObject*
3053PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3054{
Victor Stinner99b95382011-07-04 14:23:54 +02003055#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003056 return PyUnicode_DecodeMBCS(s, size, NULL);
3057#elif defined(__APPLE__)
3058 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3059#else
Victor Stinner793b5312011-04-27 00:24:21 +02003060 PyInterpreterState *interp = PyThreadState_GET()->interp;
3061 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3062 cannot use it to encode and decode filenames before it is loaded. Load
3063 the Python codec requires to encode at least its own filename. Use the C
3064 version of the locale codec until the codec registry is initialized and
3065 the Python codec is loaded.
3066
3067 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3068 cannot only rely on it: check also interp->fscodec_initialized for
3069 subinterpreters. */
3070 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003071 return PyUnicode_Decode(s, size,
3072 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003073 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003074 }
3075 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003076 /* locale encoding with surrogateescape */
3077 wchar_t *wchar;
3078 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003079 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003080
3081 if (s[size] != '\0' || size != strlen(s)) {
3082 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3083 return NULL;
3084 }
3085
Victor Stinner168e1172010-10-16 23:16:16 +00003086 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003087 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003088 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003089
Victor Stinner168e1172010-10-16 23:16:16 +00003090 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003091 PyMem_Free(wchar);
3092 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003093 }
Victor Stinnerad158722010-10-27 00:25:46 +00003094#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003095}
3096
Martin v. Löwis011e8422009-05-05 04:43:17 +00003097
3098int
3099PyUnicode_FSConverter(PyObject* arg, void* addr)
3100{
3101 PyObject *output = NULL;
3102 Py_ssize_t size;
3103 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003104 if (arg == NULL) {
3105 Py_DECREF(*(PyObject**)addr);
3106 return 1;
3107 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003108 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003109 output = arg;
3110 Py_INCREF(output);
3111 }
3112 else {
3113 arg = PyUnicode_FromObject(arg);
3114 if (!arg)
3115 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003116 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003117 Py_DECREF(arg);
3118 if (!output)
3119 return 0;
3120 if (!PyBytes_Check(output)) {
3121 Py_DECREF(output);
3122 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3123 return 0;
3124 }
3125 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003126 size = PyBytes_GET_SIZE(output);
3127 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003128 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003129 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003130 Py_DECREF(output);
3131 return 0;
3132 }
3133 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003134 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003135}
3136
3137
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003138int
3139PyUnicode_FSDecoder(PyObject* arg, void* addr)
3140{
3141 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003142 if (arg == NULL) {
3143 Py_DECREF(*(PyObject**)addr);
3144 return 1;
3145 }
3146 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003147 if (PyUnicode_READY(arg))
3148 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003149 output = arg;
3150 Py_INCREF(output);
3151 }
3152 else {
3153 arg = PyBytes_FromObject(arg);
3154 if (!arg)
3155 return 0;
3156 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3157 PyBytes_GET_SIZE(arg));
3158 Py_DECREF(arg);
3159 if (!output)
3160 return 0;
3161 if (!PyUnicode_Check(output)) {
3162 Py_DECREF(output);
3163 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3164 return 0;
3165 }
3166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3168 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003169 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3170 Py_DECREF(output);
3171 return 0;
3172 }
3173 *(PyObject**)addr = output;
3174 return Py_CLEANUP_SUPPORTED;
3175}
3176
3177
Martin v. Löwis5b222132007-06-10 09:51:05 +00003178char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003180{
Christian Heimesf3863112007-11-22 07:46:41 +00003181 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003182 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3183
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003184 if (!PyUnicode_Check(unicode)) {
3185 PyErr_BadArgument();
3186 return NULL;
3187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003188 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003189 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003190
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003191 if (PyUnicode_UTF8(unicode) == NULL) {
3192 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003193 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3194 if (bytes == NULL)
3195 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003196 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3197 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003198 Py_DECREF(bytes);
3199 return NULL;
3200 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003201 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3202 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003203 Py_DECREF(bytes);
3204 }
3205
3206 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003207 *psize = PyUnicode_UTF8_LENGTH(unicode);
3208 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003209}
3210
3211char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3215}
3216
3217#ifdef Py_DEBUG
3218int unicode_as_unicode_calls = 0;
3219#endif
3220
3221
3222Py_UNICODE *
3223PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3224{
3225 PyUnicodeObject *u;
3226 const unsigned char *one_byte;
3227#if SIZEOF_WCHAR_T == 4
3228 const Py_UCS2 *two_bytes;
3229#else
3230 const Py_UCS4 *four_bytes;
3231 const Py_UCS4 *ucs4_end;
3232 Py_ssize_t num_surrogates;
3233#endif
3234 wchar_t *w;
3235 wchar_t *wchar_end;
3236
3237 if (!PyUnicode_Check(unicode)) {
3238 PyErr_BadArgument();
3239 return NULL;
3240 }
3241 u = (PyUnicodeObject*)unicode;
3242 if (_PyUnicode_WSTR(u) == NULL) {
3243 /* Non-ASCII compact unicode object */
3244 assert(_PyUnicode_KIND(u) != 0);
3245 assert(PyUnicode_IS_READY(u));
3246
3247#ifdef Py_DEBUG
3248 ++unicode_as_unicode_calls;
3249#endif
3250
3251 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3252#if SIZEOF_WCHAR_T == 2
3253 four_bytes = PyUnicode_4BYTE_DATA(u);
3254 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3255 num_surrogates = 0;
3256
3257 for (; four_bytes < ucs4_end; ++four_bytes) {
3258 if (*four_bytes > 0xFFFF)
3259 ++num_surrogates;
3260 }
3261
3262 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3263 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3264 if (!_PyUnicode_WSTR(u)) {
3265 PyErr_NoMemory();
3266 return NULL;
3267 }
3268 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3269
3270 w = _PyUnicode_WSTR(u);
3271 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3272 four_bytes = PyUnicode_4BYTE_DATA(u);
3273 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3274 if (*four_bytes > 0xFFFF) {
3275 /* encode surrogate pair in this case */
3276 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3277 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3278 }
3279 else
3280 *w = *four_bytes;
3281
3282 if (w > wchar_end) {
3283 assert(0 && "Miscalculated string end");
3284 }
3285 }
3286 *w = 0;
3287#else
3288 /* sizeof(wchar_t) == 4 */
3289 Py_FatalError("Impossible unicode object state, wstr and str "
3290 "should share memory already.");
3291 return NULL;
3292#endif
3293 }
3294 else {
3295 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3296 (_PyUnicode_LENGTH(u) + 1));
3297 if (!_PyUnicode_WSTR(u)) {
3298 PyErr_NoMemory();
3299 return NULL;
3300 }
3301 if (!PyUnicode_IS_COMPACT_ASCII(u))
3302 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3303 w = _PyUnicode_WSTR(u);
3304 wchar_end = w + _PyUnicode_LENGTH(u);
3305
3306 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3307 one_byte = PyUnicode_1BYTE_DATA(u);
3308 for (; w < wchar_end; ++one_byte, ++w)
3309 *w = *one_byte;
3310 /* null-terminate the wstr */
3311 *w = 0;
3312 }
3313 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3314#if SIZEOF_WCHAR_T == 4
3315 two_bytes = PyUnicode_2BYTE_DATA(u);
3316 for (; w < wchar_end; ++two_bytes, ++w)
3317 *w = *two_bytes;
3318 /* null-terminate the wstr */
3319 *w = 0;
3320#else
3321 /* sizeof(wchar_t) == 2 */
3322 PyObject_FREE(_PyUnicode_WSTR(u));
3323 _PyUnicode_WSTR(u) = NULL;
3324 Py_FatalError("Impossible unicode object state, wstr "
3325 "and str should share memory already.");
3326 return NULL;
3327#endif
3328 }
3329 else {
3330 assert(0 && "This should never happen.");
3331 }
3332 }
3333 }
3334 if (size != NULL)
3335 *size = PyUnicode_WSTR_LENGTH(u);
3336 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003337}
3338
Alexander Belopolsky40018472011-02-26 01:02:56 +00003339Py_UNICODE *
3340PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343}
3344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345
Alexander Belopolsky40018472011-02-26 01:02:56 +00003346Py_ssize_t
3347PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348{
3349 if (!PyUnicode_Check(unicode)) {
3350 PyErr_BadArgument();
3351 goto onError;
3352 }
3353 return PyUnicode_GET_SIZE(unicode);
3354
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 return -1;
3357}
3358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003359Py_ssize_t
3360PyUnicode_GetLength(PyObject *unicode)
3361{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003362 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003363 PyErr_BadArgument();
3364 return -1;
3365 }
3366
3367 return PyUnicode_GET_LENGTH(unicode);
3368}
3369
3370Py_UCS4
3371PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3372{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003373 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3374 PyErr_BadArgument();
3375 return (Py_UCS4)-1;
3376 }
3377 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3378 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379 return (Py_UCS4)-1;
3380 }
3381 return PyUnicode_READ_CHAR(unicode, index);
3382}
3383
3384int
3385PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3386{
3387 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003388 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389 return -1;
3390 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003391 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3392 PyErr_SetString(PyExc_IndexError, "string index out of range");
3393 return -1;
3394 }
3395 if (_PyUnicode_Dirty(unicode))
3396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3398 index, ch);
3399 return 0;
3400}
3401
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3407
Victor Stinner554f3f02010-06-16 23:33:54 +00003408/* create or adjust a UnicodeDecodeError */
3409static void
3410make_decode_exception(PyObject **exceptionObject,
3411 const char *encoding,
3412 const char *input, Py_ssize_t length,
3413 Py_ssize_t startpos, Py_ssize_t endpos,
3414 const char *reason)
3415{
3416 if (*exceptionObject == NULL) {
3417 *exceptionObject = PyUnicodeDecodeError_Create(
3418 encoding, input, length, startpos, endpos, reason);
3419 }
3420 else {
3421 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3422 goto onError;
3423 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3424 goto onError;
3425 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3426 goto onError;
3427 }
3428 return;
3429
3430onError:
3431 Py_DECREF(*exceptionObject);
3432 *exceptionObject = NULL;
3433}
3434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435/* error handling callback helper:
3436 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003437 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 and adjust various state variables.
3439 return 0 on success, -1 on error
3440*/
3441
Alexander Belopolsky40018472011-02-26 01:02:56 +00003442static int
3443unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003444 const char *encoding, const char *reason,
3445 const char **input, const char **inend, Py_ssize_t *startinpos,
3446 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3447 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003449 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450
3451 PyObject *restuple = NULL;
3452 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003453 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003454 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003455 Py_ssize_t requiredsize;
3456 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003458 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003459 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 int res = -1;
3461
3462 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003463 *errorHandler = PyCodec_LookupError(errors);
3464 if (*errorHandler == NULL)
3465 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 }
3467
Victor Stinner554f3f02010-06-16 23:33:54 +00003468 make_decode_exception(exceptionObject,
3469 encoding,
3470 *input, *inend - *input,
3471 *startinpos, *endinpos,
3472 reason);
3473 if (*exceptionObject == NULL)
3474 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475
3476 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3477 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003480 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 }
3483 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003485
3486 /* Copy back the bytes variables, which might have been modified by the
3487 callback */
3488 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3489 if (!inputobj)
3490 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003491 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003493 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003494 *input = PyBytes_AS_STRING(inputobj);
3495 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003496 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003497 /* we can DECREF safely, as the exception has another reference,
3498 so the object won't go away. */
3499 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003503 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3505 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507
3508 /* need more space? (at least enough for what we
3509 have+the replacement+the rest of the string (starting
3510 at the new input position), so we won't have to check space
3511 when there are no errors in the rest of the string) */
3512 repptr = PyUnicode_AS_UNICODE(repunicode);
3513 repsize = PyUnicode_GET_SIZE(repunicode);
3514 requiredsize = *outpos + repsize + insize-newpos;
3515 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 if (requiredsize<2*outsize)
3517 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003518 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 goto onError;
3520 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 }
3522 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003523 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 Py_UNICODE_COPY(*outptr, repptr, repsize);
3525 *outptr += repsize;
3526 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 /* we made it! */
3529 res = 0;
3530
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 Py_XDECREF(restuple);
3533 return res;
3534}
3535
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003536/* --- UTF-7 Codec -------------------------------------------------------- */
3537
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538/* See RFC2152 for details. We encode conservatively and decode liberally. */
3539
/* Three simple macros defining base-64, built on two private helpers. */

/* Helper: does c lie in the inclusive range [lo, hi]? */
#define B64_IN_RANGE(c, lo, hi) \
    ((c) >= (lo) && (c) <= (hi))

/* Helper: the base-64 alphabet, indexed by 6-bit value. */
#define B64_ALPHABET \
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (B64_IN_RANGE(c, 'A', 'Z') || \
     B64_IN_RANGE(c, 'a', 'z') || \
     B64_IN_RANGE(c, '0', '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, digit or '+' maps to 63, the value
   of '/') */

#define FROM_BASE64(c) \
    (B64_IN_RANGE(c, 'A', 'Z') ? (c) - 'A' : \
     B64_IN_RANGE(c, 'a', 'z') ? (c) - 'a' + 26 : \
     B64_IN_RANGE(c, '0', '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    (B64_ALPHABET[(n) & 0x3f])
3562
/* DECODE_DIRECT: should this byte, encountered in a UTF-7 string, be
 * decoded as itself?  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string.  Bytes above 127
 * are never decoded directly. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
3570
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is the character; directO / directWS are truth values selecting
 * direct encoding of Set O and of whitespace.  NUL and characters >= 128
 * are never direct.  The flag arguments are parenthesized so that
 * expression arguments (e.g. `a || b`) bind as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616
Alexander Belopolsky40018472011-02-26 01:02:56 +00003617PyObject *
3618PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003619 Py_ssize_t size,
3620 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003621{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003622 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3623}
3624
Antoine Pitrou244651a2009-05-04 18:56:13 +00003625/* The decoder. The only state we preserve is our read position,
3626 * i.e. how many characters we have consumed. So if we end in the
3627 * middle of a shift sequence we have to back off the read position
3628 * and the output to the beginning of the sequence, otherwise we lose
3629 * all the shift state (seen bits, number of bits seen, high
3630 * surrogate). */
3631
Alexander Belopolsky40018472011-02-26 01:02:56 +00003632PyObject *
3633PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003634 Py_ssize_t size,
3635 const char *errors,
3636 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003639 Py_ssize_t startinpos;
3640 Py_ssize_t endinpos;
3641 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003642 const char *e;
3643 PyUnicodeObject *unicode;
3644 Py_UNICODE *p;
3645 const char *errmsg = "";
3646 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003647 Py_UNICODE *shiftOutStart;
3648 unsigned int base64bits = 0;
3649 unsigned long base64buffer = 0;
3650 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 PyObject *errorHandler = NULL;
3652 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003653
3654 unicode = _PyUnicode_New(size);
3655 if (!unicode)
3656 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003657 if (size == 0) {
3658 if (consumed)
3659 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003660 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003661 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003664 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665 e = s + size;
3666
3667 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003670 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671
Antoine Pitrou244651a2009-05-04 18:56:13 +00003672 if (inShift) { /* in a base-64 section */
3673 if (IS_BASE64(ch)) { /* consume a base-64 character */
3674 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3675 base64bits += 6;
3676 s++;
3677 if (base64bits >= 16) {
3678 /* we have enough bits for a UTF-16 value */
3679 Py_UNICODE outCh = (Py_UNICODE)
3680 (base64buffer >> (base64bits-16));
3681 base64bits -= 16;
3682 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3683 if (surrogate) {
3684 /* expecting a second surrogate */
3685 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3686#ifdef Py_UNICODE_WIDE
3687 *p++ = (((surrogate & 0x3FF)<<10)
3688 | (outCh & 0x3FF)) + 0x10000;
3689#else
3690 *p++ = surrogate;
3691 *p++ = outCh;
3692#endif
3693 surrogate = 0;
3694 }
3695 else {
3696 surrogate = 0;
3697 errmsg = "second surrogate missing";
3698 goto utf7Error;
3699 }
3700 }
3701 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3702 /* first surrogate */
3703 surrogate = outCh;
3704 }
3705 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3706 errmsg = "unexpected second surrogate";
3707 goto utf7Error;
3708 }
3709 else {
3710 *p++ = outCh;
3711 }
3712 }
3713 }
3714 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003715 inShift = 0;
3716 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003717 if (surrogate) {
3718 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003719 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003720 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003721 if (base64bits > 0) { /* left-over bits */
3722 if (base64bits >= 6) {
3723 /* We've seen at least one base-64 character */
3724 errmsg = "partial character in shift sequence";
3725 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003726 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003727 else {
3728 /* Some bits remain; they should be zero */
3729 if (base64buffer != 0) {
3730 errmsg = "non-zero padding bits in shift sequence";
3731 goto utf7Error;
3732 }
3733 }
3734 }
3735 if (ch != '-') {
3736 /* '-' is absorbed; other terminating
3737 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738 *p++ = ch;
3739 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003740 }
3741 }
3742 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003744 s++; /* consume '+' */
3745 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003746 s++;
3747 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003748 }
3749 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003750 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003751 shiftOutStart = p;
3752 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003753 }
3754 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003755 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003756 *p++ = ch;
3757 s++;
3758 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003759 else {
3760 startinpos = s-starts;
3761 s++;
3762 errmsg = "unexpected special character";
3763 goto utf7Error;
3764 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003766utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 outpos = p-PyUnicode_AS_UNICODE(unicode);
3768 endinpos = s-starts;
3769 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 errors, &errorHandler,
3771 "utf7", errmsg,
3772 &starts, &e, &startinpos, &endinpos, &exc, &s,
3773 &unicode, &outpos, &p))
3774 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775 }
3776
Antoine Pitrou244651a2009-05-04 18:56:13 +00003777 /* end of string */
3778
3779 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3780 /* if we're in an inconsistent state, that's an error */
3781 if (surrogate ||
3782 (base64bits >= 6) ||
3783 (base64bits > 0 && base64buffer != 0)) {
3784 outpos = p-PyUnicode_AS_UNICODE(unicode);
3785 endinpos = size;
3786 if (unicode_decode_call_errorhandler(
3787 errors, &errorHandler,
3788 "utf7", "unterminated shift sequence",
3789 &starts, &e, &startinpos, &endinpos, &exc, &s,
3790 &unicode, &outpos, &p))
3791 goto onError;
3792 if (s < e)
3793 goto restart;
3794 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796
3797 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003798 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003799 if (inShift) {
3800 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003801 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003802 }
3803 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003804 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003805 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003806 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807
Victor Stinnerfe226c02011-10-03 03:52:20 +02003808 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 goto onError;
3810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003813#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003814 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 Py_DECREF(unicode);
3816 return NULL;
3817 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003818#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003819 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820 return (PyObject *)unicode;
3821
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_XDECREF(errorHandler);
3824 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825 Py_DECREF(unicode);
3826 return NULL;
3827}
3828
3829
Alexander Belopolsky40018472011-02-26 01:02:56 +00003830PyObject *
3831PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003832 Py_ssize_t size,
3833 int base64SetO,
3834 int base64WhiteSpace,
3835 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003836{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003837 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003838 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003839 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003840 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003841 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003842 unsigned int base64bits = 0;
3843 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 char * out;
3845 char * start;
3846
3847 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003850 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003851 return PyErr_NoMemory();
3852
Antoine Pitrou244651a2009-05-04 18:56:13 +00003853 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854 if (v == NULL)
3855 return NULL;
3856
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003857 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003858 for (;i < size; ++i) {
3859 Py_UNICODE ch = s[i];
3860
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861 if (inShift) {
3862 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3863 /* shifting out */
3864 if (base64bits) { /* output remaining bits */
3865 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3866 base64buffer = 0;
3867 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003868 }
3869 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003870 /* Characters not in the BASE64 set implicitly unshift the sequence
3871 so no '-' is required, except if the character is itself a '-' */
3872 if (IS_BASE64(ch) || ch == '-') {
3873 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 *out++ = (char) ch;
3876 }
3877 else {
3878 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003879 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003881 else { /* not in a shift sequence */
3882 if (ch == '+') {
3883 *out++ = '+';
3884 *out++ = '-';
3885 }
3886 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3887 *out++ = (char) ch;
3888 }
3889 else {
3890 *out++ = '+';
3891 inShift = 1;
3892 goto encode_char;
3893 }
3894 }
3895 continue;
3896encode_char:
3897#ifdef Py_UNICODE_WIDE
3898 if (ch >= 0x10000) {
3899 /* code first surrogate */
3900 base64bits += 16;
3901 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3902 while (base64bits >= 6) {
3903 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3904 base64bits -= 6;
3905 }
3906 /* prepare second surrogate */
3907 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3908 }
3909#endif
3910 base64bits += 16;
3911 base64buffer = (base64buffer << 16) | ch;
3912 while (base64bits >= 6) {
3913 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3914 base64bits -= 6;
3915 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003916 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003917 if (base64bits)
3918 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3919 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003921 if (_PyBytes_Resize(&v, out - start) < 0)
3922 return NULL;
3923 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003924}
3925
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926#undef IS_BASE64
3927#undef FROM_BASE64
3928#undef TO_BASE64
3929#undef DECODE_DIRECT
3930#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003931
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932/* --- UTF-8 Codec -------------------------------------------------------- */
3933
/* Total length, in bytes, of the UTF-8 sequence introduced by each
   possible first byte.  An entry of 0 marks a byte that is not a legal
   sequence start.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 00-0F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 10-1F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 20-2F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 30-3F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 40-4F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 50-5F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 60-6F */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,     /* 70-7F */
    /* 80-BF: illegal as a first byte */
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,     /* 80-8F */
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,     /* 90-9F */
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,     /* A0-AF */
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,     /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0,0,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,     /* C0-CF */
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,     /* D0-DF */
    /* E0-EF start three-byte sequences */
    3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,     /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4,4,4,4, 4,0,0,0, 0,0,0,0, 0,0,0,0      /* F0-FF */
};
3955
Alexander Belopolsky40018472011-02-26 01:02:56 +00003956PyObject *
3957PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003958 Py_ssize_t size,
3959 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960{
Walter Dörwald69652032004-09-07 20:24:22 +00003961 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3962}
3963
Antoine Pitrouab868312009-01-10 15:40:25 +00003964/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3965#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3966
3967/* Mask to quickly check whether a C 'long' contains a
3968 non-ASCII, UTF8-encoded char. */
3969#if (SIZEOF_LONG == 8)
3970# define ASCII_CHAR_MASK 0x8080808080808080L
3971#elif (SIZEOF_LONG == 4)
3972# define ASCII_CHAR_MASK 0x80808080L
3973#else
3974# error C 'long' size should be either 4 or 8!
3975#endif
3976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977/* Scans a UTF-8 string and returns the maximum character to be expected,
3978 the size of the decoded unicode string and if any major errors were
3979 encountered.
3980
3981 This function does check basic UTF-8 sanity, it does however NOT CHECK
3982 if the string contains surrogates, and if all continuation bytes are
3983 within the correct ranges, these checks are performed in
3984 PyUnicode_DecodeUTF8Stateful.
3985
3986 If it sets has_errors to 1, it means the value of unicode_size and max_char
3987 will be bogus and you should not rely on useful information in them.
3988 */
3989static Py_UCS4
3990utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3991 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3992 int *has_errors)
3993{
3994 Py_ssize_t n;
3995 Py_ssize_t char_count = 0;
3996 Py_UCS4 max_char = 127, new_max;
3997 Py_UCS4 upper_bound;
3998 const unsigned char *p = (const unsigned char *)s;
3999 const unsigned char *end = p + string_size;
4000 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4001 int err = 0;
4002
4003 for (; p < end && !err; ++p, ++char_count) {
4004 /* Only check value if it's not a ASCII char... */
4005 if (*p < 0x80) {
4006 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4007 an explanation. */
4008 if (!((size_t) p & LONG_PTR_MASK)) {
4009 /* Help register allocation */
4010 register const unsigned char *_p = p;
4011 while (_p < aligned_end) {
4012 unsigned long value = *(unsigned long *) _p;
4013 if (value & ASCII_CHAR_MASK)
4014 break;
4015 _p += SIZEOF_LONG;
4016 char_count += SIZEOF_LONG;
4017 }
4018 p = _p;
4019 if (p == end)
4020 break;
4021 }
4022 }
4023 if (*p >= 0x80) {
4024 n = utf8_code_length[*p];
4025 new_max = max_char;
4026 switch (n) {
4027 /* invalid start byte */
4028 case 0:
4029 err = 1;
4030 break;
4031 case 2:
4032 /* Code points between 0x00FF and 0x07FF inclusive.
4033 Approximate the upper bound of the code point,
4034 if this flips over 255 we can be sure it will be more
4035 than 255 and the string will need 2 bytes per code coint,
4036 if it stays under or equal to 255, we can be sure 1 byte
4037 is enough.
4038 ((*p & 0b00011111) << 6) | 0b00111111 */
4039 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4040 if (max_char < upper_bound)
4041 new_max = upper_bound;
4042 /* Ensure we track at least that we left ASCII space. */
4043 if (new_max < 128)
4044 new_max = 128;
4045 break;
4046 case 3:
4047 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4048 always > 255 and <= 65535 and will always need 2 bytes. */
4049 if (max_char < 65535)
4050 new_max = 65535;
4051 break;
4052 case 4:
4053 /* Code point will be above 0xFFFF for sure in this case. */
4054 new_max = 65537;
4055 break;
4056 /* Internal error, this should be caught by the first if */
4057 case 1:
4058 default:
4059 assert(0 && "Impossible case in utf8_max_char_and_size");
4060 err = 1;
4061 }
4062 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004063 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 --n;
4065 /* Check if the follow up chars are all valid continuation bytes */
4066 if (n >= 1) {
4067 const unsigned char *cont;
4068 if ((p + n) >= end) {
4069 if (consumed == 0)
4070 /* incomplete data, non-incremental decoding */
4071 err = 1;
4072 break;
4073 }
4074 for (cont = p + 1; cont < (p + n); ++cont) {
4075 if ((*cont & 0xc0) != 0x80) {
4076 err = 1;
4077 break;
4078 }
4079 }
4080 p += n;
4081 }
4082 else
4083 err = 1;
4084 max_char = new_max;
4085 }
4086 }
4087
4088 if (unicode_size)
4089 *unicode_size = char_count;
4090 if (has_errors)
4091 *has_errors = err;
4092 return max_char;
4093}
4094
/* Store value at position index of buf, using the element type that
   goes with kind.  Similar to PyUnicode_WRITE, but it can also write
   into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND).  Any kind that is not the wchar, 1-byte or
   2-byte kind is written as Py_UCS4.  kind is evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int write_kind_ = (kind);                                 \
        if (write_kind_ == PyUnicode_WCHAR_KIND) {                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        }                                                               \
        else if (write_kind_ == PyUnicode_1BYTE_KIND) {                 \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        }                                                               \
        else if (write_kind_ == PyUnicode_2BYTE_KIND) {                 \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        }                                                               \
        else {                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
        }                                                               \
    } while (0)
4109
Alexander Belopolsky40018472011-02-26 01:02:56 +00004110PyObject *
4111PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 Py_ssize_t size,
4113 const char *errors,
4114 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004118 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004119 Py_ssize_t startinpos;
4120 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004121 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004123 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 PyObject *errorHandler = NULL;
4125 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 Py_UCS4 maxchar = 0;
4127 Py_ssize_t unicode_size;
4128 Py_ssize_t i;
4129 int kind;
4130 void *data;
4131 int has_errors;
4132 Py_UNICODE *error_outptr;
4133#if SIZEOF_WCHAR_T == 2
4134 Py_ssize_t wchar_offset = 0;
4135#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Walter Dörwald69652032004-09-07 20:24:22 +00004137 if (size == 0) {
4138 if (consumed)
4139 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004142 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4143 consumed, &has_errors);
4144 if (has_errors) {
4145 unicode = _PyUnicode_New(size);
4146 if (!unicode)
4147 return NULL;
4148 kind = PyUnicode_WCHAR_KIND;
4149 data = PyUnicode_AS_UNICODE(unicode);
4150 assert(data != NULL);
4151 }
4152 else {
4153 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4154 if (!unicode)
4155 return NULL;
4156 /* When the string is ASCII only, just use memcpy and return.
4157 unicode_size may be != size if there is an incomplete UTF-8
4158 sequence at the end of the ASCII block. */
4159 if (maxchar < 128 && size == unicode_size) {
4160 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4161 return (PyObject *)unicode;
4162 }
4163 kind = PyUnicode_KIND(unicode);
4164 data = PyUnicode_DATA(unicode);
4165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004169 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170
4171 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004172 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173
4174 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004175 /* Fast path for runs of ASCII characters. Given that common UTF-8
4176 input will consist of an overwhelming majority of ASCII
4177 characters, we try to optimize for this case by checking
4178 as many characters as a C 'long' can contain.
4179 First, check if we can do an aligned read, as most CPUs have
4180 a penalty for unaligned reads.
4181 */
4182 if (!((size_t) s & LONG_PTR_MASK)) {
4183 /* Help register allocation */
4184 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004186 while (_s < aligned_end) {
4187 /* Read a whole long at a time (either 4 or 8 bytes),
4188 and do a fast unrolled copy if it only contains ASCII
4189 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004190 unsigned long value = *(unsigned long *) _s;
4191 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004192 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4194 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4195 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4196 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004197#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4199 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4200 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4201 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004202#endif
4203 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004204 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004205 }
4206 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004208 if (s == e)
4209 break;
4210 ch = (unsigned char)*s;
4211 }
4212 }
4213
4214 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004215 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 s++;
4217 continue;
4218 }
4219
4220 n = utf8_code_length[ch];
4221
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004222 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 if (consumed)
4224 break;
4225 else {
4226 errmsg = "unexpected end of data";
4227 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004228 endinpos = startinpos+1;
4229 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4230 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 goto utf8Error;
4232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234
4235 switch (n) {
4236
4237 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004238 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 startinpos = s-starts;
4240 endinpos = startinpos+1;
4241 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242
4243 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004244 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 startinpos = s-starts;
4246 endinpos = startinpos+1;
4247 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248
4249 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004250 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004251 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004253 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 goto utf8Error;
4255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004257 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004258 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 break;
4260
4261 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004262 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4263 will result in surrogates in range d800-dfff. Surrogates are
4264 not valid UTF-8 so they are rejected.
4265 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4266 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004267 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004268 (s[2] & 0xc0) != 0x80 ||
4269 ((unsigned char)s[0] == 0xE0 &&
4270 (unsigned char)s[1] < 0xA0) ||
4271 ((unsigned char)s[0] == 0xED &&
4272 (unsigned char)s[1] > 0x9F)) {
4273 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004275 endinpos = startinpos + 1;
4276
4277 /* if s[1] first two bits are 1 and 0, then the invalid
4278 continuation byte is s[2], so increment endinpos by 1,
4279 if not, s[1] is invalid and endinpos doesn't need to
4280 be incremented. */
4281 if ((s[1] & 0xC0) == 0x80)
4282 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 goto utf8Error;
4284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004286 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004287 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004288 break;
4289
4290 case 4:
4291 if ((s[1] & 0xc0) != 0x80 ||
4292 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004293 (s[3] & 0xc0) != 0x80 ||
4294 ((unsigned char)s[0] == 0xF0 &&
4295 (unsigned char)s[1] < 0x90) ||
4296 ((unsigned char)s[0] == 0xF4 &&
4297 (unsigned char)s[1] > 0x8F)) {
4298 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004300 endinpos = startinpos + 1;
4301 if ((s[1] & 0xC0) == 0x80) {
4302 endinpos++;
4303 if ((s[2] & 0xC0) == 0x80)
4304 endinpos++;
4305 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 goto utf8Error;
4307 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004308 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004309 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4310 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004312 /* If the string is flexible or we have native UCS-4, write
4313 directly.. */
4314 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4315 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 else {
4318 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004320 /* translate from 10000..10FFFF to 0..FFFF */
4321 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004323 /* high surrogate = top 10 bits added to D800 */
4324 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4325 (Py_UNICODE)(0xD800 + (ch >> 10)));
4326
4327 /* low surrogate = bottom 10 bits added to DC00 */
4328 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4329 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4330 }
4331#if SIZEOF_WCHAR_T == 2
4332 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004333#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 }
4336 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004338
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340 /* If this is not yet a resizable string, make it one.. */
4341 if (kind != PyUnicode_WCHAR_KIND) {
4342 const Py_UNICODE *u;
4343 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4344 if (!new_unicode)
4345 goto onError;
4346 u = PyUnicode_AsUnicode((PyObject *)unicode);
4347 if (!u)
4348 goto onError;
4349#if SIZEOF_WCHAR_T == 2
4350 i += wchar_offset;
4351#endif
4352 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4353 Py_DECREF(unicode);
4354 unicode = new_unicode;
4355 kind = 0;
4356 data = PyUnicode_AS_UNICODE(new_unicode);
4357 assert(data != NULL);
4358 }
4359 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 if (unicode_decode_call_errorhandler(
4361 errors, &errorHandler,
4362 "utf8", errmsg,
4363 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004366 /* Update data because unicode_decode_call_errorhandler might have
4367 re-created or resized the unicode object. */
4368 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004371 /* Ensure the unicode_size calculation above was correct: */
4372 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4373
Walter Dörwald69652032004-09-07 20:24:22 +00004374 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004377 /* Adjust length and ready string when it contained errors and
4378 is of the old resizable kind. */
4379 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004380 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004381 goto onError;
4382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 Py_XDECREF(errorHandler);
4385 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004386#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004387 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004388 Py_DECREF(unicode);
4389 return NULL;
4390 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004391#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004392 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 return (PyObject *)unicode;
4394
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 Py_XDECREF(errorHandler);
4397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 Py_DECREF(unicode);
4399 return NULL;
4400}
4401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004402#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004403
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004404#ifdef __APPLE__
4405
4406/* Simplified UTF-8 decoder using surrogateescape error handler,
4407 used to decode the command line arguments on Mac OS X. */
4408
4409wchar_t*
4410_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4411{
4412 int n;
4413 const char *e;
4414 wchar_t *unicode, *p;
4415
4416 /* Note: size will always be longer than the resulting Unicode
4417 character count */
4418 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4419 PyErr_NoMemory();
4420 return NULL;
4421 }
4422 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4423 if (!unicode)
4424 return NULL;
4425
4426 /* Unpack UTF-8 encoded data */
4427 p = unicode;
4428 e = s + size;
4429 while (s < e) {
4430 Py_UCS4 ch = (unsigned char)*s;
4431
4432 if (ch < 0x80) {
4433 *p++ = (wchar_t)ch;
4434 s++;
4435 continue;
4436 }
4437
4438 n = utf8_code_length[ch];
4439 if (s + n > e) {
4440 goto surrogateescape;
4441 }
4442
4443 switch (n) {
4444 case 0:
4445 case 1:
4446 goto surrogateescape;
4447
4448 case 2:
4449 if ((s[1] & 0xc0) != 0x80)
4450 goto surrogateescape;
4451 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4452 assert ((ch > 0x007F) && (ch <= 0x07FF));
4453 *p++ = (wchar_t)ch;
4454 break;
4455
4456 case 3:
4457 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4458 will result in surrogates in range d800-dfff. Surrogates are
4459 not valid UTF-8 so they are rejected.
4460 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4461 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4462 if ((s[1] & 0xc0) != 0x80 ||
4463 (s[2] & 0xc0) != 0x80 ||
4464 ((unsigned char)s[0] == 0xE0 &&
4465 (unsigned char)s[1] < 0xA0) ||
4466 ((unsigned char)s[0] == 0xED &&
4467 (unsigned char)s[1] > 0x9F)) {
4468
4469 goto surrogateescape;
4470 }
4471 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4472 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004473 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004474 break;
4475
4476 case 4:
4477 if ((s[1] & 0xc0) != 0x80 ||
4478 (s[2] & 0xc0) != 0x80 ||
4479 (s[3] & 0xc0) != 0x80 ||
4480 ((unsigned char)s[0] == 0xF0 &&
4481 (unsigned char)s[1] < 0x90) ||
4482 ((unsigned char)s[0] == 0xF4 &&
4483 (unsigned char)s[1] > 0x8F)) {
4484 goto surrogateescape;
4485 }
4486 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4487 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4488 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4489
4490#if SIZEOF_WCHAR_T == 4
4491 *p++ = (wchar_t)ch;
4492#else
4493 /* compute and append the two surrogates: */
4494
4495 /* translate from 10000..10FFFF to 0..FFFF */
4496 ch -= 0x10000;
4497
4498 /* high surrogate = top 10 bits added to D800 */
4499 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4500
4501 /* low surrogate = bottom 10 bits added to DC00 */
4502 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4503#endif
4504 break;
4505 }
4506 s += n;
4507 continue;
4508
4509 surrogateescape:
4510 *p++ = 0xDC00 + ch;
4511 s++;
4512 }
4513 *p = L'\0';
4514 return unicode;
4515}
4516
4517#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519/* Primary internal function which creates utf8 encoded bytes objects.
4520
4521 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004522 and allocate exactly as much space needed at the end. Else allocate the
4523 maximum possible needed (4 result bytes per Unicode character), and return
4524 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004525*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004526PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528{
Tim Peters602f7402002-04-27 18:03:26 +00004529#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004530
Guido van Rossum98297ee2007-11-06 21:34:58 +00004531 Py_ssize_t i; /* index into s of next input byte */
4532 PyObject *result; /* result string object */
4533 char *p; /* next free byte in output buffer */
4534 Py_ssize_t nallocated; /* number of result bytes allocated */
4535 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004536 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004537 PyObject *errorHandler = NULL;
4538 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004539 int kind;
4540 void *data;
4541 Py_ssize_t size;
4542 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4543#if SIZEOF_WCHAR_T == 2
4544 Py_ssize_t wchar_offset = 0;
4545#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004547 if (!PyUnicode_Check(unicode)) {
4548 PyErr_BadArgument();
4549 return NULL;
4550 }
4551
4552 if (PyUnicode_READY(unicode) == -1)
4553 return NULL;
4554
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004555 if (PyUnicode_UTF8(unicode))
4556 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4557 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004558
4559 kind = PyUnicode_KIND(unicode);
4560 data = PyUnicode_DATA(unicode);
4561 size = PyUnicode_GET_LENGTH(unicode);
4562
Tim Peters602f7402002-04-27 18:03:26 +00004563 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564
Tim Peters602f7402002-04-27 18:03:26 +00004565 if (size <= MAX_SHORT_UNICHARS) {
4566 /* Write into the stack buffer; nallocated can't overflow.
4567 * At the end, we'll allocate exactly as much heap space as it
4568 * turns out we need.
4569 */
4570 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004571 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004572 p = stackbuf;
4573 }
4574 else {
4575 /* Overallocate on the heap, and give the excess back at the end. */
4576 nallocated = size * 4;
4577 if (nallocated / 4 != size) /* overflow! */
4578 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004579 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004580 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004581 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004582 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004583 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004584
Tim Peters602f7402002-04-27 18:03:26 +00004585 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004587
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004588 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004589 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004591
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004593 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004594 *p++ = (char)(0xc0 | (ch >> 6));
4595 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004596 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004597 Py_ssize_t newpos;
4598 PyObject *rep;
4599 Py_ssize_t repsize, k, startpos;
4600 startpos = i-1;
4601#if SIZEOF_WCHAR_T == 2
4602 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004603#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004604 rep = unicode_encode_call_errorhandler(
4605 errors, &errorHandler, "utf-8", "surrogates not allowed",
4606 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4607 &exc, startpos, startpos+1, &newpos);
4608 if (!rep)
4609 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004611 if (PyBytes_Check(rep))
4612 repsize = PyBytes_GET_SIZE(rep);
4613 else
4614 repsize = PyUnicode_GET_SIZE(rep);
4615
4616 if (repsize > 4) {
4617 Py_ssize_t offset;
4618
4619 if (result == NULL)
4620 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004621 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004622 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4625 /* integer overflow */
4626 PyErr_NoMemory();
4627 goto error;
4628 }
4629 nallocated += repsize - 4;
4630 if (result != NULL) {
4631 if (_PyBytes_Resize(&result, nallocated) < 0)
4632 goto error;
4633 } else {
4634 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004635 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004636 goto error;
4637 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4638 }
4639 p = PyBytes_AS_STRING(result) + offset;
4640 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642 if (PyBytes_Check(rep)) {
4643 char *prep = PyBytes_AS_STRING(rep);
4644 for(k = repsize; k > 0; k--)
4645 *p++ = *prep++;
4646 } else /* rep is unicode */ {
4647 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4648 Py_UNICODE c;
4649
4650 for(k=0; k<repsize; k++) {
4651 c = prep[k];
4652 if (0x80 <= c) {
4653 raise_encode_exception(&exc, "utf-8",
4654 PyUnicode_AS_UNICODE(unicode),
4655 size, i-1, i,
4656 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004657 goto error;
4658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004660 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004662 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004663 } else if (ch < 0x10000) {
4664 *p++ = (char)(0xe0 | (ch >> 12));
4665 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4666 *p++ = (char)(0x80 | (ch & 0x3f));
4667 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004668 /* Encode UCS4 Unicode ordinals */
4669 *p++ = (char)(0xf0 | (ch >> 18));
4670 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4671 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4672 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673#if SIZEOF_WCHAR_T == 2
4674 wchar_offset++;
4675#endif
Tim Peters602f7402002-04-27 18:03:26 +00004676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004678
Guido van Rossum98297ee2007-11-06 21:34:58 +00004679 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004680 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004681 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004682 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004683 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004684 }
4685 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004686 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004687 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004688 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004689 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004691
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004692 Py_XDECREF(errorHandler);
4693 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004694 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004695 error:
4696 Py_XDECREF(errorHandler);
4697 Py_XDECREF(exc);
4698 Py_XDECREF(result);
4699 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004700
Tim Peters602f7402002-04-27 18:03:26 +00004701#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702}
4703
Alexander Belopolsky40018472011-02-26 01:02:56 +00004704PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4706 Py_ssize_t size,
4707 const char *errors)
4708{
4709 PyObject *v, *unicode;
4710
4711 unicode = PyUnicode_FromUnicode(s, size);
4712 if (unicode == NULL)
4713 return NULL;
4714 v = _PyUnicode_AsUTF8String(unicode, errors);
4715 Py_DECREF(unicode);
4716 return v;
4717}
4718
4719PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004720PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004722 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723}
4724
Walter Dörwald41980ca2007-08-16 21:55:45 +00004725/* --- UTF-32 Codec ------------------------------------------------------- */
4726
4727PyObject *
4728PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 Py_ssize_t size,
4730 const char *errors,
4731 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732{
4733 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4734}
4735
4736PyObject *
4737PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 Py_ssize_t size,
4739 const char *errors,
4740 int *byteorder,
4741 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742{
4743 const char *starts = s;
4744 Py_ssize_t startinpos;
4745 Py_ssize_t endinpos;
4746 Py_ssize_t outpos;
4747 PyUnicodeObject *unicode;
4748 Py_UNICODE *p;
4749#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004750 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004751 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752#else
4753 const int pairs = 0;
4754#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004755 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756 int bo = 0; /* assume native ordering by default */
4757 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004758 /* Offsets from q for retrieving bytes in the right order. */
4759#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4760 int iorder[] = {0, 1, 2, 3};
4761#else
4762 int iorder[] = {3, 2, 1, 0};
4763#endif
4764 PyObject *errorHandler = NULL;
4765 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004766
Walter Dörwald41980ca2007-08-16 21:55:45 +00004767 q = (unsigned char *)s;
4768 e = q + size;
4769
4770 if (byteorder)
4771 bo = *byteorder;
4772
4773 /* Check for BOM marks (U+FEFF) in the input and adjust current
4774 byte order setting accordingly. In native mode, the leading BOM
4775 mark is skipped, in all other modes, it is copied to the output
4776 stream as-is (giving a ZWNBSP character). */
4777 if (bo == 0) {
4778 if (size >= 4) {
4779 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004781#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 if (bom == 0x0000FEFF) {
4783 q += 4;
4784 bo = -1;
4785 }
4786 else if (bom == 0xFFFE0000) {
4787 q += 4;
4788 bo = 1;
4789 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004790#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 if (bom == 0x0000FEFF) {
4792 q += 4;
4793 bo = 1;
4794 }
4795 else if (bom == 0xFFFE0000) {
4796 q += 4;
4797 bo = -1;
4798 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004801 }
4802
4803 if (bo == -1) {
4804 /* force LE */
4805 iorder[0] = 0;
4806 iorder[1] = 1;
4807 iorder[2] = 2;
4808 iorder[3] = 3;
4809 }
4810 else if (bo == 1) {
4811 /* force BE */
4812 iorder[0] = 3;
4813 iorder[1] = 2;
4814 iorder[2] = 1;
4815 iorder[3] = 0;
4816 }
4817
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004818 /* On narrow builds we split characters outside the BMP into two
4819 codepoints => count how much extra space we need. */
4820#ifndef Py_UNICODE_WIDE
4821 for (qq = q; qq < e; qq += 4)
4822 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4823 pairs++;
4824#endif
4825
4826 /* This might be one to much, because of a BOM */
4827 unicode = _PyUnicode_New((size+3)/4+pairs);
4828 if (!unicode)
4829 return NULL;
4830 if (size == 0)
4831 return (PyObject *)unicode;
4832
4833 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004835
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 Py_UCS4 ch;
4838 /* remaining bytes at the end? (size should be divisible by 4) */
4839 if (e-q<4) {
4840 if (consumed)
4841 break;
4842 errmsg = "truncated data";
4843 startinpos = ((const char *)q)-starts;
4844 endinpos = ((const char *)e)-starts;
4845 goto utf32Error;
4846 /* The remaining input chars are ignored if the callback
4847 chooses to skip the input */
4848 }
4849 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4850 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004851
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 if (ch >= 0x110000)
4853 {
4854 errmsg = "codepoint not in range(0x110000)";
4855 startinpos = ((const char *)q)-starts;
4856 endinpos = startinpos+4;
4857 goto utf32Error;
4858 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004859#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 if (ch >= 0x10000)
4861 {
4862 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4863 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4864 }
4865 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 *p++ = ch;
4868 q += 4;
4869 continue;
4870 utf32Error:
4871 outpos = p-PyUnicode_AS_UNICODE(unicode);
4872 if (unicode_decode_call_errorhandler(
4873 errors, &errorHandler,
4874 "utf32", errmsg,
4875 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4876 &unicode, &outpos, &p))
4877 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878 }
4879
4880 if (byteorder)
4881 *byteorder = bo;
4882
4883 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004885
4886 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004887 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004888 goto onError;
4889
4890 Py_XDECREF(errorHandler);
4891 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004892#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004893 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894 Py_DECREF(unicode);
4895 return NULL;
4896 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004897#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004898 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899 return (PyObject *)unicode;
4900
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902 Py_DECREF(unicode);
4903 Py_XDECREF(errorHandler);
4904 Py_XDECREF(exc);
4905 return NULL;
4906}
4907
4908PyObject *
4909PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 Py_ssize_t size,
4911 const char *errors,
4912 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004914 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004915 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004916 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004917#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004918 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919#else
4920 const int pairs = 0;
4921#endif
4922 /* Offsets from p for storing byte pairs in the right order. */
4923#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4924 int iorder[] = {0, 1, 2, 3};
4925#else
4926 int iorder[] = {3, 2, 1, 0};
4927#endif
4928
Benjamin Peterson29060642009-01-31 22:14:21 +00004929#define STORECHAR(CH) \
4930 do { \
4931 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4932 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4933 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4934 p[iorder[0]] = (CH) & 0xff; \
4935 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936 } while(0)
4937
4938 /* In narrow builds we can output surrogate pairs as one codepoint,
4939 so we need less space. */
4940#ifndef Py_UNICODE_WIDE
4941 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4943 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4944 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004946 nsize = (size - pairs + (byteorder == 0));
4947 bytesize = nsize * 4;
4948 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004950 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004951 if (v == NULL)
4952 return NULL;
4953
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004954 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004958 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959
4960 if (byteorder == -1) {
4961 /* force LE */
4962 iorder[0] = 0;
4963 iorder[1] = 1;
4964 iorder[2] = 2;
4965 iorder[3] = 3;
4966 }
4967 else if (byteorder == 1) {
4968 /* force BE */
4969 iorder[0] = 3;
4970 iorder[1] = 2;
4971 iorder[2] = 1;
4972 iorder[3] = 0;
4973 }
4974
4975 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4979 Py_UCS4 ch2 = *s;
4980 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4981 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4982 s++;
4983 size--;
4984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004985 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986#endif
4987 STORECHAR(ch);
4988 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004989
4990 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004991 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992#undef STORECHAR
4993}
4994
Alexander Belopolsky40018472011-02-26 01:02:56 +00004995PyObject *
4996PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997{
4998 if (!PyUnicode_Check(unicode)) {
4999 PyErr_BadArgument();
5000 return NULL;
5001 }
5002 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 PyUnicode_GET_SIZE(unicode),
5004 NULL,
5005 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006}
5007
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008/* --- UTF-16 Codec ------------------------------------------------------- */
5009
Tim Peters772747b2001-08-09 22:21:55 +00005010PyObject *
5011PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 Py_ssize_t size,
5013 const char *errors,
5014 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015{
Walter Dörwald69652032004-09-07 20:24:22 +00005016 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5017}
5018
Antoine Pitrouab868312009-01-10 15:40:25 +00005019/* Two masks for fast checking of whether a C 'long' may contain
5020 UTF16-encoded surrogate characters. This is an efficient heuristic,
5021 assuming that non-surrogate characters with a code point >= 0x8000 are
5022 rare in most input.
5023 FAST_CHAR_MASK is used when the input is in native byte ordering,
5024 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005025*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005026#if (SIZEOF_LONG == 8)
5027# define FAST_CHAR_MASK 0x8000800080008000L
5028# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5029#elif (SIZEOF_LONG == 4)
5030# define FAST_CHAR_MASK 0x80008000L
5031# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5032#else
5033# error C 'long' size should be either 4 or 8!
5034#endif
5035
Walter Dörwald69652032004-09-07 20:24:22 +00005036PyObject *
5037PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 Py_ssize_t size,
5039 const char *errors,
5040 int *byteorder,
5041 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005042{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t startinpos;
5045 Py_ssize_t endinpos;
5046 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 PyUnicodeObject *unicode;
5048 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005049 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005050 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005051 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005052 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005053 /* Offsets from q for retrieving byte pairs in the right order. */
5054#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5055 int ihi = 1, ilo = 0;
5056#else
5057 int ihi = 0, ilo = 1;
5058#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 PyObject *errorHandler = NULL;
5060 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061
5062 /* Note: size will always be longer than the resulting Unicode
5063 character count */
5064 unicode = _PyUnicode_New(size);
5065 if (!unicode)
5066 return NULL;
5067 if (size == 0)
5068 return (PyObject *)unicode;
5069
5070 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005072 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005073 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
5075 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005076 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005078 /* Check for BOM marks (U+FEFF) in the input and adjust current
5079 byte order setting accordingly. In native mode, the leading BOM
5080 mark is skipped, in all other modes, it is copied to the output
5081 stream as-is (giving a ZWNBSP character). */
5082 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005083 if (size >= 2) {
5084 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005085#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 if (bom == 0xFEFF) {
5087 q += 2;
5088 bo = -1;
5089 }
5090 else if (bom == 0xFFFE) {
5091 q += 2;
5092 bo = 1;
5093 }
Tim Petersced69f82003-09-16 20:30:58 +00005094#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 if (bom == 0xFEFF) {
5096 q += 2;
5097 bo = 1;
5098 }
5099 else if (bom == 0xFFFE) {
5100 q += 2;
5101 bo = -1;
5102 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005103#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106
Tim Peters772747b2001-08-09 22:21:55 +00005107 if (bo == -1) {
5108 /* force LE */
5109 ihi = 1;
5110 ilo = 0;
5111 }
5112 else if (bo == 1) {
5113 /* force BE */
5114 ihi = 0;
5115 ilo = 1;
5116 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005117#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5118 native_ordering = ilo < ihi;
5119#else
5120 native_ordering = ilo > ihi;
5121#endif
Tim Peters772747b2001-08-09 22:21:55 +00005122
Antoine Pitrouab868312009-01-10 15:40:25 +00005123 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005124 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005126 /* First check for possible aligned read of a C 'long'. Unaligned
5127 reads are more expensive, better to defer to another iteration. */
5128 if (!((size_t) q & LONG_PTR_MASK)) {
5129 /* Fast path for runs of non-surrogate chars. */
5130 register const unsigned char *_q = q;
5131 Py_UNICODE *_p = p;
5132 if (native_ordering) {
5133 /* Native ordering is simple: as long as the input cannot
5134 possibly contain a surrogate char, do an unrolled copy
5135 of several 16-bit code points to the target object.
5136 The non-surrogate check is done on several input bytes
5137 at a time (as many as a C 'long' can contain). */
5138 while (_q < aligned_end) {
5139 unsigned long data = * (unsigned long *) _q;
5140 if (data & FAST_CHAR_MASK)
5141 break;
5142 _p[0] = ((unsigned short *) _q)[0];
5143 _p[1] = ((unsigned short *) _q)[1];
5144#if (SIZEOF_LONG == 8)
5145 _p[2] = ((unsigned short *) _q)[2];
5146 _p[3] = ((unsigned short *) _q)[3];
5147#endif
5148 _q += SIZEOF_LONG;
5149 _p += SIZEOF_LONG / 2;
5150 }
5151 }
5152 else {
5153 /* Byteswapped ordering is similar, but we must decompose
5154 the copy bytewise, and take care of zero'ing out the
5155 upper bytes if the target object is in 32-bit units
5156 (that is, in UCS-4 builds). */
5157 while (_q < aligned_end) {
5158 unsigned long data = * (unsigned long *) _q;
5159 if (data & SWAPPED_FAST_CHAR_MASK)
5160 break;
5161 /* Zero upper bytes in UCS-4 builds */
5162#if (Py_UNICODE_SIZE > 2)
5163 _p[0] = 0;
5164 _p[1] = 0;
5165#if (SIZEOF_LONG == 8)
5166 _p[2] = 0;
5167 _p[3] = 0;
5168#endif
5169#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005170 /* Issue #4916; UCS-4 builds on big endian machines must
5171 fill the two last bytes of each 4-byte unit. */
5172#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5173# define OFF 2
5174#else
5175# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005176#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005177 ((unsigned char *) _p)[OFF + 1] = _q[0];
5178 ((unsigned char *) _p)[OFF + 0] = _q[1];
5179 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5180 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5181#if (SIZEOF_LONG == 8)
5182 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5183 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5184 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5185 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5186#endif
5187#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005188 _q += SIZEOF_LONG;
5189 _p += SIZEOF_LONG / 2;
5190 }
5191 }
5192 p = _p;
5193 q = _q;
5194 if (q >= e)
5195 break;
5196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198
Benjamin Peterson14339b62009-01-31 16:36:08 +00005199 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005200
5201 if (ch < 0xD800 || ch > 0xDFFF) {
5202 *p++ = ch;
5203 continue;
5204 }
5205
5206 /* UTF-16 code pair: */
5207 if (q > e) {
5208 errmsg = "unexpected end of data";
5209 startinpos = (((const char *)q) - 2) - starts;
5210 endinpos = ((const char *)e) + 1 - starts;
5211 goto utf16Error;
5212 }
5213 if (0xD800 <= ch && ch <= 0xDBFF) {
5214 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5215 q += 2;
5216 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005217#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 *p++ = ch;
5219 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005220#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005222#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 continue;
5224 }
5225 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005226 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 startinpos = (((const char *)q)-4)-starts;
5228 endinpos = startinpos+2;
5229 goto utf16Error;
5230 }
5231
Benjamin Peterson14339b62009-01-31 16:36:08 +00005232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 errmsg = "illegal encoding";
5234 startinpos = (((const char *)q)-2)-starts;
5235 endinpos = startinpos+2;
5236 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005237
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 utf16Error:
5239 outpos = p - PyUnicode_AS_UNICODE(unicode);
5240 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005241 errors,
5242 &errorHandler,
5243 "utf16", errmsg,
5244 &starts,
5245 (const char **)&e,
5246 &startinpos,
5247 &endinpos,
5248 &exc,
5249 (const char **)&q,
5250 &unicode,
5251 &outpos,
5252 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005255 /* remaining byte at the end? (size should be even) */
5256 if (e == q) {
5257 if (!consumed) {
5258 errmsg = "truncated data";
5259 startinpos = ((const char *)q) - starts;
5260 endinpos = ((const char *)e) + 1 - starts;
5261 outpos = p - PyUnicode_AS_UNICODE(unicode);
5262 if (unicode_decode_call_errorhandler(
5263 errors,
5264 &errorHandler,
5265 "utf16", errmsg,
5266 &starts,
5267 (const char **)&e,
5268 &startinpos,
5269 &endinpos,
5270 &exc,
5271 (const char **)&q,
5272 &unicode,
5273 &outpos,
5274 &p))
5275 goto onError;
5276 /* The remaining input chars are ignored if the callback
5277 chooses to skip the input */
5278 }
5279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
5281 if (byteorder)
5282 *byteorder = bo;
5283
Walter Dörwald69652032004-09-07 20:24:22 +00005284 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005286
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005288 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 goto onError;
5290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005291 Py_XDECREF(errorHandler);
5292 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005293#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005294 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005295 Py_DECREF(unicode);
5296 return NULL;
5297 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005298#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005299 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 return (PyObject *)unicode;
5301
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 Py_XDECREF(errorHandler);
5305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 return NULL;
5307}
5308
Antoine Pitrouab868312009-01-10 15:40:25 +00005309#undef FAST_CHAR_MASK
5310#undef SWAPPED_FAST_CHAR_MASK
5311
Tim Peters772747b2001-08-09 22:21:55 +00005312PyObject *
5313PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 Py_ssize_t size,
5315 const char *errors,
5316 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005318 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005319 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005320 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005321#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005322 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005323#else
5324 const int pairs = 0;
5325#endif
Tim Peters772747b2001-08-09 22:21:55 +00005326 /* Offsets from p for storing byte pairs in the right order. */
5327#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5328 int ihi = 1, ilo = 0;
5329#else
5330 int ihi = 0, ilo = 1;
5331#endif
5332
Benjamin Peterson29060642009-01-31 22:14:21 +00005333#define STORECHAR(CH) \
5334 do { \
5335 p[ihi] = ((CH) >> 8) & 0xff; \
5336 p[ilo] = (CH) & 0xff; \
5337 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005338 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005340#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005341 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 if (s[i] >= 0x10000)
5343 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005344#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005345 /* 2 * (size + pairs + (byteorder == 0)) */
5346 if (size > PY_SSIZE_T_MAX ||
5347 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005349 nsize = size + pairs + (byteorder == 0);
5350 bytesize = nsize * 2;
5351 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005353 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 if (v == NULL)
5355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005357 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005360 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005361 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005362
5363 if (byteorder == -1) {
5364 /* force LE */
5365 ihi = 1;
5366 ilo = 0;
5367 }
5368 else if (byteorder == 1) {
5369 /* force BE */
5370 ihi = 0;
5371 ilo = 1;
5372 }
5373
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005374 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 Py_UNICODE ch = *s++;
5376 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005377#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 if (ch >= 0x10000) {
5379 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5380 ch = 0xD800 | ((ch-0x10000) >> 10);
5381 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005382#endif
Tim Peters772747b2001-08-09 22:21:55 +00005383 STORECHAR(ch);
5384 if (ch2)
5385 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005386 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005387
5388 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005389 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005390#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391}
5392
Alexander Belopolsky40018472011-02-26 01:02:56 +00005393PyObject *
5394PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
5396 if (!PyUnicode_Check(unicode)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
5400 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 PyUnicode_GET_SIZE(unicode),
5402 NULL,
5403 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404}
5405
5406/* --- Unicode Escape Codec ----------------------------------------------- */
5407
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string.

   s     -- start of the escaped input
   size  -- number of bytes available at s

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be guaranteed to be ASCII: size is negative, a
   byte is > 127, a backslash is the last byte, or a numeric/named
   escape (\ooo, \x, \u, \U, \N) is present.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with
       it; computing p + size first would step outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character, both are kept verbatim */
                length += 2;
            }
        }
    }
    return length;
}
5462
5463/* Similar to PyUnicode_WRITE but either write into wstr field
5464 or treat string as ASCII. */
5465#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5466 do { \
5467 if ((kind) != PyUnicode_WCHAR_KIND) \
5468 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5469 else \
5470 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5471 } while (0)
5472
5473#define WRITE_WSTR(buf, index, value) \
5474 assert(kind == PyUnicode_WCHAR_KIND), \
5475 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5476
5477
Fredrik Lundh06d12682001-01-24 07:59:11 +00005478static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005479
Alexander Belopolsky40018472011-02-26 01:02:56 +00005480PyObject *
5481PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005482 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005483 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005486 Py_ssize_t startinpos;
5487 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 char* message;
5493 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 PyObject *errorHandler = NULL;
5495 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005496 Py_ssize_t ascii_length;
5497 Py_ssize_t i;
5498 int kind;
5499 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 ascii_length = length_of_escaped_ascii_string(s, size);
5502
5503 /* After length_of_escaped_ascii_string() there are two alternatives,
5504 either the string is pure ASCII with named escapes like \n, etc.
5505 and we determined it's exact size (common case)
5506 or it contains \x, \u, ... escape sequences. then we create a
5507 legacy wchar string and resize it at the end of this function. */
5508 if (ascii_length >= 0) {
5509 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5510 if (!v)
5511 goto onError;
5512 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5513 kind = PyUnicode_1BYTE_KIND;
5514 data = PyUnicode_DATA(v);
5515 }
5516 else {
5517 /* Escaped strings will always be longer than the resulting
5518 Unicode string, so we start with size here and then reduce the
5519 length after conversion to the true value.
5520 (but if the error callback returns a long replacement string
5521 we'll have to allocate more space) */
5522 v = _PyUnicode_New(size);
5523 if (!v)
5524 goto onError;
5525 kind = PyUnicode_WCHAR_KIND;
5526 data = PyUnicode_AS_UNICODE(v);
5527 }
5528
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 if (size == 0)
5530 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005533
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 while (s < end) {
5535 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005536 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539 if (kind == PyUnicode_WCHAR_KIND) {
5540 assert(i < _PyUnicode_WSTR_LENGTH(v));
5541 }
5542 else {
5543 /* The only case in which i == ascii_length is a backslash
5544 followed by a newline. */
5545 assert(i <= ascii_length);
5546 }
5547
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 /* Non-escape characters are interpreted as Unicode ordinals */
5549 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 continue;
5552 }
5553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 /* \ - Escapes */
5556 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005557 c = *s++;
5558 if (s > end)
5559 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005560
5561 if (kind == PyUnicode_WCHAR_KIND) {
5562 assert(i < _PyUnicode_WSTR_LENGTH(v));
5563 }
5564 else {
5565 /* The only case in which i == ascii_length is a backslash
5566 followed by a newline. */
5567 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5568 }
5569
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005570 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005574 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5575 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5576 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5577 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5578 /* FF */
5579 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5580 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5581 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5582 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5583 /* VT */
5584 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5585 /* BEL, not classic C */
5586 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 case '0': case '1': case '2': case '3':
5590 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005591 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005592 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005593 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005594 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005595 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005597 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 break;
5599
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 /* hex escapes */
5601 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005603 digits = 2;
5604 message = "truncated \\xXX escape";
5605 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005609 digits = 4;
5610 message = "truncated \\uXXXX escape";
5611 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005614 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005615 digits = 8;
5616 message = "truncated \\UXXXXXXXX escape";
5617 hexescape:
5618 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 if (s+digits>end) {
5621 endinpos = size;
5622 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 errors, &errorHandler,
5624 "unicodeescape", "end of string in escape sequence",
5625 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 goto nextByte;
5630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 for (j = 0; j < digits; ++j) {
5632 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005633 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 endinpos = (s+j+1)-starts;
5635 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 errors, &errorHandler,
5638 "unicodeescape", message,
5639 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005641 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005644 }
5645 chr = (chr<<4) & ~0xF;
5646 if (c >= '0' && c <= '9')
5647 chr += c - '0';
5648 else if (c >= 'a' && c <= 'f')
5649 chr += 10 + c - 'a';
5650 else
5651 chr += 10 + c - 'A';
5652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005654 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 /* _decoding_error will have already written into the
5656 target buffer. */
5657 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005659 /* when we get here, chr is a 32-bit unicode character */
5660 if (chr <= 0xffff)
5661 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005663 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005664 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005665 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005666#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005668#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005669 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5671 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005672#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005673 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 errors, &errorHandler,
5678 "unicodeescape", "illegal Unicode character",
5679 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005681 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005682 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005684 break;
5685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687 case 'N':
5688 message = "malformed \\N character escape";
5689 if (ucnhash_CAPI == NULL) {
5690 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5692 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005693 if (ucnhash_CAPI == NULL)
5694 goto ucnhashError;
5695 }
5696 if (*s == '{') {
5697 const char *start = s+1;
5698 /* look for the closing brace */
5699 while (*s != '}' && s < end)
5700 s++;
5701 if (s > start && s < end && *s == '}') {
5702 /* found a name. look it up in the unicode database */
5703 message = "unknown Unicode character name";
5704 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5706 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005707 goto store;
5708 }
5709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 errors, &errorHandler,
5714 "unicodeescape", message,
5715 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005717 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 break;
5720
5721 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005722 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 message = "\\ at end of string";
5725 s--;
5726 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005727 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 errors, &errorHandler,
5730 "unicodeescape", message,
5731 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005733 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005735 }
5736 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5738 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005739 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005740 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 /* Ensure the length prediction worked in case of ASCII strings */
5746 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5747
Victor Stinnerfe226c02011-10-03 03:52:20 +02005748 if (kind == PyUnicode_WCHAR_KIND)
5749 {
5750 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5751 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005752 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005753 Py_XDECREF(errorHandler);
5754 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005755#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005756 if (_PyUnicode_READY_REPLACE(&v)) {
5757 Py_DECREF(v);
5758 return NULL;
5759 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005760#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005761 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005763
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005765 PyErr_SetString(
5766 PyExc_UnicodeError,
5767 "\\N escapes not supported (can't load unicodedata module)"
5768 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005769 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 Py_XDECREF(errorHandler);
5771 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005772 return NULL;
5773
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 Py_XDECREF(errorHandler);
5777 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 return NULL;
5779}
5780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781#undef WRITE_ASCII_OR_WSTR
5782#undef WRITE_WSTR
5783
/* Return a Unicode-Escape encoded version of the given Py_UNICODE buffer.

   Note: this comment used to describe a "quotes" flag that enclosed the
   result in u"" or u'' quotes; the function below takes only the buffer
   and its size.

*/
5790
Walter Dörwald79e913e2007-05-12 11:08:06 +00005791static const char *hexdigits = "0123456789abcdef";
5792
Alexander Belopolsky40018472011-02-26 01:02:56 +00005793PyObject *
5794PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005795 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005797 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005800#ifdef Py_UNICODE_WIDE
5801 const Py_ssize_t expandsize = 10;
5802#else
5803 const Py_ssize_t expandsize = 6;
5804#endif
5805
Thomas Wouters89f507f2006-12-13 04:49:30 +00005806 /* XXX(nnorwitz): rather than over-allocating, it would be
5807 better to choose a different scheme. Perhaps scan the
5808 first N-chars of the string and allocate based on that size.
5809 */
5810 /* Initial allocation is based on the longest-possible unichr
5811 escape.
5812
5813 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5814 unichr, so in this case it's the longest unichr escape. In
5815 narrow (UTF-16) builds this is five chars per source unichr
5816 since there are two unichrs in the surrogate pair, so in narrow
5817 (UTF-16) builds it's not the longest unichr escape.
5818
5819 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5820 so in the narrow (UTF-16) build case it's the longest unichr
5821 escape.
5822 */
5823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 if (size == 0)
5825 return PyBytes_FromStringAndSize(NULL, 0);
5826
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005827 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005829
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005830 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 2
5832 + expandsize*size
5833 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 if (repr == NULL)
5835 return NULL;
5836
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005837 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 while (size-- > 0) {
5840 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005841
Walter Dörwald79e913e2007-05-12 11:08:06 +00005842 /* Escape backslashes */
5843 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 *p++ = '\\';
5845 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005846 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005847 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005848
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005849#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005850 /* Map 21-bit characters to '\U00xxxxxx' */
5851 else if (ch >= 0x10000) {
5852 *p++ = '\\';
5853 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005854 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5855 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5856 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5857 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5858 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5859 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5860 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5861 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005863 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005864#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5866 else if (ch >= 0xD800 && ch < 0xDC00) {
5867 Py_UNICODE ch2;
5868 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005869
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 ch2 = *s++;
5871 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005872 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5874 *p++ = '\\';
5875 *p++ = 'U';
5876 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5877 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5878 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5879 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5880 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5881 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5882 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5883 *p++ = hexdigits[ucs & 0x0000000F];
5884 continue;
5885 }
5886 /* Fall through: isolated surrogates are copied as-is */
5887 s--;
5888 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005889 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005890#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005893 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 *p++ = '\\';
5895 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005896 *p++ = hexdigits[(ch >> 12) & 0x000F];
5897 *p++ = hexdigits[(ch >> 8) & 0x000F];
5898 *p++ = hexdigits[(ch >> 4) & 0x000F];
5899 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005901
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005902 /* Map special whitespace to '\t', \n', '\r' */
5903 else if (ch == '\t') {
5904 *p++ = '\\';
5905 *p++ = 't';
5906 }
5907 else if (ch == '\n') {
5908 *p++ = '\\';
5909 *p++ = 'n';
5910 }
5911 else if (ch == '\r') {
5912 *p++ = '\\';
5913 *p++ = 'r';
5914 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005915
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005916 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005917 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005919 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005920 *p++ = hexdigits[(ch >> 4) & 0x000F];
5921 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005922 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005923
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 /* Copy everything else as-is */
5925 else
5926 *p++ = (char) ch;
5927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 assert(p - PyBytes_AS_STRING(repr) > 0);
5930 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5931 return NULL;
5932 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933}
5934
Alexander Belopolsky40018472011-02-26 01:02:56 +00005935PyObject *
5936PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005938 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (!PyUnicode_Check(unicode)) {
5940 PyErr_BadArgument();
5941 return NULL;
5942 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005943 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5944 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005945 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
5948/* --- Raw Unicode Escape Codec ------------------------------------------- */
5949
Alexander Belopolsky40018472011-02-26 01:02:56 +00005950PyObject *
5951PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005952 Py_ssize_t size,
5953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 Py_ssize_t startinpos;
5957 Py_ssize_t endinpos;
5958 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 const char *end;
5962 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 PyObject *errorHandler = NULL;
5964 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 /* Escaped strings will always be longer than the resulting
5967 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968 length after conversion to the true value. (But decoding error
5969 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 v = _PyUnicode_New(size);
5971 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 end = s + size;
5977 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 unsigned char c;
5979 Py_UCS4 x;
5980 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005981 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 /* Non-escape characters are interpreted as Unicode ordinals */
5984 if (*s != '\\') {
5985 *p++ = (unsigned char)*s++;
5986 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 startinpos = s-starts;
5989
5990 /* \u-escapes are only interpreted iff the number of leading
5991 backslashes if odd */
5992 bs = s;
5993 for (;s < end;) {
5994 if (*s != '\\')
5995 break;
5996 *p++ = (unsigned char)*s++;
5997 }
5998 if (((s - bs) & 1) == 0 ||
5999 s >= end ||
6000 (*s != 'u' && *s != 'U')) {
6001 continue;
6002 }
6003 p--;
6004 count = *s=='u' ? 4 : 8;
6005 s++;
6006
6007 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6008 outpos = p-PyUnicode_AS_UNICODE(v);
6009 for (x = 0, i = 0; i < count; ++i, ++s) {
6010 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006011 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 endinpos = s-starts;
6013 if (unicode_decode_call_errorhandler(
6014 errors, &errorHandler,
6015 "rawunicodeescape", "truncated \\uXXXX",
6016 &starts, &end, &startinpos, &endinpos, &exc, &s,
6017 &v, &outpos, &p))
6018 goto onError;
6019 goto nextByte;
6020 }
6021 x = (x<<4) & ~0xF;
6022 if (c >= '0' && c <= '9')
6023 x += c - '0';
6024 else if (c >= 'a' && c <= 'f')
6025 x += 10 + c - 'a';
6026 else
6027 x += 10 + c - 'A';
6028 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006029 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* UCS-2 character */
6031 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006032 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 /* UCS-4 character. Either store directly, or as
6034 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006035#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 x -= 0x10000L;
6039 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6040 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006041#endif
6042 } else {
6043 endinpos = s-starts;
6044 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006045 if (unicode_decode_call_errorhandler(
6046 errors, &errorHandler,
6047 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 &starts, &end, &startinpos, &endinpos, &exc, &s,
6049 &v, &outpos, &p))
6050 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 nextByte:
6053 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006055 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 Py_XDECREF(errorHandler);
6058 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006059#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006060 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 Py_DECREF(v);
6062 return NULL;
6063 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006064#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006065 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006067
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 Py_XDECREF(errorHandler);
6071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 return NULL;
6073}
6074
Alexander Belopolsky40018472011-02-26 01:02:56 +00006075PyObject *
6076PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006077 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006079 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 char *p;
6081 char *q;
6082
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006083#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006084 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006086 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006087#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006088
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006089 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006091
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006092 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 if (repr == NULL)
6094 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006095 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006096 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006098 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 while (size-- > 0) {
6100 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006101#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* Map 32-bit characters to '\Uxxxxxxxx' */
6103 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006104 *p++ = '\\';
6105 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006106 *p++ = hexdigits[(ch >> 28) & 0xf];
6107 *p++ = hexdigits[(ch >> 24) & 0xf];
6108 *p++ = hexdigits[(ch >> 20) & 0xf];
6109 *p++ = hexdigits[(ch >> 16) & 0xf];
6110 *p++ = hexdigits[(ch >> 12) & 0xf];
6111 *p++ = hexdigits[(ch >> 8) & 0xf];
6112 *p++ = hexdigits[(ch >> 4) & 0xf];
6113 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006114 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006115 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006116#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6118 if (ch >= 0xD800 && ch < 0xDC00) {
6119 Py_UNICODE ch2;
6120 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006121
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 ch2 = *s++;
6123 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006124 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6126 *p++ = '\\';
6127 *p++ = 'U';
6128 *p++ = hexdigits[(ucs >> 28) & 0xf];
6129 *p++ = hexdigits[(ucs >> 24) & 0xf];
6130 *p++ = hexdigits[(ucs >> 20) & 0xf];
6131 *p++ = hexdigits[(ucs >> 16) & 0xf];
6132 *p++ = hexdigits[(ucs >> 12) & 0xf];
6133 *p++ = hexdigits[(ucs >> 8) & 0xf];
6134 *p++ = hexdigits[(ucs >> 4) & 0xf];
6135 *p++ = hexdigits[ucs & 0xf];
6136 continue;
6137 }
6138 /* Fall through: isolated surrogates are copied as-is */
6139 s--;
6140 size++;
6141 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006142#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 /* Map 16-bit characters to '\uxxxx' */
6144 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 *p++ = '\\';
6146 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006147 *p++ = hexdigits[(ch >> 12) & 0xf];
6148 *p++ = hexdigits[(ch >> 8) & 0xf];
6149 *p++ = hexdigits[(ch >> 4) & 0xf];
6150 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 /* Copy everything else as-is */
6153 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 *p++ = (char) ch;
6155 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006156 size = p - q;
6157
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006158 assert(size > 0);
6159 if (_PyBytes_Resize(&repr, size) < 0)
6160 return NULL;
6161 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162}
6163
Alexander Belopolsky40018472011-02-26 01:02:56 +00006164PyObject *
6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006167 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006169 PyErr_BadArgument();
6170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006172 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6173 PyUnicode_GET_SIZE(unicode));
6174
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006175 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176}
6177
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006178/* --- Unicode Internal Codec ------------------------------------------- */
6179
Alexander Belopolsky40018472011-02-26 01:02:56 +00006180PyObject *
6181_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006182 Py_ssize_t size,
6183 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006184{
6185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 Py_ssize_t startinpos;
6187 Py_ssize_t endinpos;
6188 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006189 PyUnicodeObject *v;
6190 Py_UNICODE *p;
6191 const char *end;
6192 const char *reason;
6193 PyObject *errorHandler = NULL;
6194 PyObject *exc = NULL;
6195
Neal Norwitzd43069c2006-01-08 01:12:10 +00006196#ifdef Py_UNICODE_WIDE
6197 Py_UNICODE unimax = PyUnicode_GetMax();
6198#endif
6199
Thomas Wouters89f507f2006-12-13 04:49:30 +00006200 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6202 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6205 as string was created with the old API. */
6206 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208 p = PyUnicode_AS_UNICODE(v);
6209 end = s + size;
6210
6211 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006212 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006213 /* We have to sanity check the raw data, otherwise doom looms for
6214 some malformed UCS-4 data. */
6215 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006216#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006217 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006218#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006219 end-s < Py_UNICODE_SIZE
6220 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 startinpos = s - starts;
6223 if (end-s < Py_UNICODE_SIZE) {
6224 endinpos = end-starts;
6225 reason = "truncated input";
6226 }
6227 else {
6228 endinpos = s - starts + Py_UNICODE_SIZE;
6229 reason = "illegal code point (> 0x10FFFF)";
6230 }
6231 outpos = p - PyUnicode_AS_UNICODE(v);
6232 if (unicode_decode_call_errorhandler(
6233 errors, &errorHandler,
6234 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006235 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006236 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006237 goto onError;
6238 }
6239 }
6240 else {
6241 p++;
6242 s += Py_UNICODE_SIZE;
6243 }
6244 }
6245
Victor Stinnerfe226c02011-10-03 03:52:20 +02006246 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 goto onError;
6248 Py_XDECREF(errorHandler);
6249 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006250#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006251 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006252 Py_DECREF(v);
6253 return NULL;
6254 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006255#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006256 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006257 return (PyObject *)v;
6258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260 Py_XDECREF(v);
6261 Py_XDECREF(errorHandler);
6262 Py_XDECREF(exc);
6263 return NULL;
6264}
6265
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266/* --- Latin-1 Codec ------------------------------------------------------ */
6267
Alexander Belopolsky40018472011-02-26 01:02:56 +00006268PyObject *
6269PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006270 Py_ssize_t size,
6271 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006274 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006278static void
6279make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006280 const char *encoding,
6281 const Py_UNICODE *unicode, Py_ssize_t size,
6282 Py_ssize_t startpos, Py_ssize_t endpos,
6283 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 *exceptionObject = PyUnicodeEncodeError_Create(
6287 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 }
6289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6291 goto onError;
6292 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6293 goto onError;
6294 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6295 goto onError;
6296 return;
6297 onError:
6298 Py_DECREF(*exceptionObject);
6299 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 }
6301}
6302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304static void
6305raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006306 const char *encoding,
6307 const Py_UNICODE *unicode, Py_ssize_t size,
6308 Py_ssize_t startpos, Py_ssize_t endpos,
6309 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310{
6311 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315}
6316
6317/* error handling callback helper:
6318 build arguments, call the callback and check the arguments,
6319 put the result into newpos and return the replacement string, which
6320 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321static PyObject *
6322unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 PyObject **errorHandler,
6324 const char *encoding, const char *reason,
6325 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6326 Py_ssize_t startpos, Py_ssize_t endpos,
6327 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006328{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006329 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330
6331 PyObject *restuple;
6332 PyObject *resunicode;
6333
6334 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 }
6339
6340 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344
6345 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006350 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 Py_DECREF(restuple);
6352 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006354 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 &resunicode, newpos)) {
6356 Py_DECREF(restuple);
6357 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006359 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6360 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6361 Py_DECREF(restuple);
6362 return NULL;
6363 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006366 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6368 Py_DECREF(restuple);
6369 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 Py_INCREF(resunicode);
6372 Py_DECREF(restuple);
6373 return resunicode;
6374}
6375
Alexander Belopolsky40018472011-02-26 01:02:56 +00006376static PyObject *
6377unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006378 Py_ssize_t size,
6379 const char *errors,
6380 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381{
6382 /* output object */
6383 PyObject *res;
6384 /* pointers to the beginning and end+1 of input */
6385 const Py_UNICODE *startp = p;
6386 const Py_UNICODE *endp = p + size;
6387 /* pointer to the beginning of the unencodable characters */
6388 /* const Py_UNICODE *badp = NULL; */
6389 /* pointer into the output */
6390 char *str;
6391 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006393 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6394 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 PyObject *errorHandler = NULL;
6396 PyObject *exc = NULL;
6397 /* the following variable is used for caching string comparisons
6398 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6399 int known_errorHandler = -1;
6400
6401 /* allocate enough for a simple encoding without
6402 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006403 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006404 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006405 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006407 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006408 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 ressize = size;
6410
6411 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 /* can we encode this? */
6415 if (c<limit) {
6416 /* no overflow check, because we know that the space is enough */
6417 *str++ = (char)c;
6418 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006419 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 else {
6421 Py_ssize_t unicodepos = p-startp;
6422 Py_ssize_t requiredsize;
6423 PyObject *repunicode;
6424 Py_ssize_t repsize;
6425 Py_ssize_t newpos;
6426 Py_ssize_t respos;
6427 Py_UNICODE *uni2;
6428 /* startpos for collecting unencodable chars */
6429 const Py_UNICODE *collstart = p;
6430 const Py_UNICODE *collend = p;
6431 /* find all unecodable characters */
6432 while ((collend < endp) && ((*collend)>=limit))
6433 ++collend;
6434 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6435 if (known_errorHandler==-1) {
6436 if ((errors==NULL) || (!strcmp(errors, "strict")))
6437 known_errorHandler = 1;
6438 else if (!strcmp(errors, "replace"))
6439 known_errorHandler = 2;
6440 else if (!strcmp(errors, "ignore"))
6441 known_errorHandler = 3;
6442 else if (!strcmp(errors, "xmlcharrefreplace"))
6443 known_errorHandler = 4;
6444 else
6445 known_errorHandler = 0;
6446 }
6447 switch (known_errorHandler) {
6448 case 1: /* strict */
6449 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6450 goto onError;
6451 case 2: /* replace */
6452 while (collstart++<collend)
6453 *str++ = '?'; /* fall through */
6454 case 3: /* ignore */
6455 p = collend;
6456 break;
6457 case 4: /* xmlcharrefreplace */
6458 respos = str - PyBytes_AS_STRING(res);
6459 /* determine replacement size (temporarily (mis)uses p) */
6460 for (p = collstart, repsize = 0; p < collend; ++p) {
6461 if (*p<10)
6462 repsize += 2+1+1;
6463 else if (*p<100)
6464 repsize += 2+2+1;
6465 else if (*p<1000)
6466 repsize += 2+3+1;
6467 else if (*p<10000)
6468 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006469#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 else
6471 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006472#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 else if (*p<100000)
6474 repsize += 2+5+1;
6475 else if (*p<1000000)
6476 repsize += 2+6+1;
6477 else
6478 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006479#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 }
6481 requiredsize = respos+repsize+(endp-collend);
6482 if (requiredsize > ressize) {
6483 if (requiredsize<2*ressize)
6484 requiredsize = 2*ressize;
6485 if (_PyBytes_Resize(&res, requiredsize))
6486 goto onError;
6487 str = PyBytes_AS_STRING(res) + respos;
6488 ressize = requiredsize;
6489 }
6490 /* generate replacement (temporarily (mis)uses p) */
6491 for (p = collstart; p < collend; ++p) {
6492 str += sprintf(str, "&#%d;", (int)*p);
6493 }
6494 p = collend;
6495 break;
6496 default:
6497 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6498 encoding, reason, startp, size, &exc,
6499 collstart-startp, collend-startp, &newpos);
6500 if (repunicode == NULL)
6501 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006502 if (PyBytes_Check(repunicode)) {
6503 /* Directly copy bytes result to output. */
6504 repsize = PyBytes_Size(repunicode);
6505 if (repsize > 1) {
6506 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006507 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006508 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6509 Py_DECREF(repunicode);
6510 goto onError;
6511 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006512 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006513 ressize += repsize-1;
6514 }
6515 memcpy(str, PyBytes_AsString(repunicode), repsize);
6516 str += repsize;
6517 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006518 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006519 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 /* need more space? (at least enough for what we
6522 have+the replacement+the rest of the string, so
6523 we won't have to check space for encodable characters) */
6524 respos = str - PyBytes_AS_STRING(res);
6525 repsize = PyUnicode_GET_SIZE(repunicode);
6526 requiredsize = respos+repsize+(endp-collend);
6527 if (requiredsize > ressize) {
6528 if (requiredsize<2*ressize)
6529 requiredsize = 2*ressize;
6530 if (_PyBytes_Resize(&res, requiredsize)) {
6531 Py_DECREF(repunicode);
6532 goto onError;
6533 }
6534 str = PyBytes_AS_STRING(res) + respos;
6535 ressize = requiredsize;
6536 }
6537 /* check if there is anything unencodable in the replacement
6538 and copy it to the output */
6539 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6540 c = *uni2;
6541 if (c >= limit) {
6542 raise_encode_exception(&exc, encoding, startp, size,
6543 unicodepos, unicodepos+1, reason);
6544 Py_DECREF(repunicode);
6545 goto onError;
6546 }
6547 *str = (char)c;
6548 }
6549 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006550 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006551 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006552 }
6553 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006554 /* Resize if we allocated to much */
6555 size = str - PyBytes_AS_STRING(res);
6556 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006557 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006558 if (_PyBytes_Resize(&res, size) < 0)
6559 goto onError;
6560 }
6561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 Py_XDECREF(errorHandler);
6563 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006564 return res;
6565
6566 onError:
6567 Py_XDECREF(res);
6568 Py_XDECREF(errorHandler);
6569 Py_XDECREF(exc);
6570 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006571}
6572
Alexander Belopolsky40018472011-02-26 01:02:56 +00006573PyObject *
6574PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006575 Py_ssize_t size,
6576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579}
6580
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006582_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
6584 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 PyErr_BadArgument();
6586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006588 if (PyUnicode_READY(unicode) == -1)
6589 return NULL;
6590 /* Fast path: if it is a one-byte string, construct
6591 bytes object directly. */
6592 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6593 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6594 PyUnicode_GET_LENGTH(unicode));
6595 /* Non-Latin-1 characters present. Defer to above function to
6596 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006599 errors);
6600}
6601
6602PyObject*
6603PyUnicode_AsLatin1String(PyObject *unicode)
6604{
6605 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606}
6607
6608/* --- 7-bit ASCII Codec -------------------------------------------------- */
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
6611PyUnicode_DecodeASCII(const char *s,
6612 Py_ssize_t size,
6613 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006617 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618 Py_ssize_t startinpos;
6619 Py_ssize_t endinpos;
6620 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006622 int has_error;
6623 const unsigned char *p = (const unsigned char *)s;
6624 const unsigned char *end = p + size;
6625 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 PyObject *errorHandler = NULL;
6627 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006630 if (size == 1 && (unsigned char)s[0] < 128)
6631 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006632
Victor Stinner702c7342011-10-05 13:50:52 +02006633 has_error = 0;
6634 while (p < end && !has_error) {
6635 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6636 an explanation. */
6637 if (!((size_t) p & LONG_PTR_MASK)) {
6638 /* Help register allocation */
6639 register const unsigned char *_p = p;
6640 while (_p < aligned_end) {
6641 unsigned long value = *(unsigned long *) _p;
6642 if (value & ASCII_CHAR_MASK) {
6643 has_error = 1;
6644 break;
6645 }
6646 _p += SIZEOF_LONG;
6647 }
6648 if (_p == end)
6649 break;
6650 if (has_error)
6651 break;
6652 p = _p;
6653 }
6654 if (*p & 0x80) {
6655 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006657 }
6658 else {
6659 ++p;
6660 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006661 }
Victor Stinner702c7342011-10-05 13:50:52 +02006662 if (!has_error)
6663 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 v = _PyUnicode_New(size);
6666 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006670 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 e = s + size;
6672 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 register unsigned char c = (unsigned char)*s;
6674 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006675 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 ++s;
6677 }
6678 else {
6679 startinpos = s-starts;
6680 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006681 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 if (unicode_decode_call_errorhandler(
6683 errors, &errorHandler,
6684 "ascii", "ordinal not in range(128)",
6685 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006686 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 goto onError;
6688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
Victor Stinner702c7342011-10-05 13:50:52 +02006690 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6691 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 Py_XDECREF(errorHandler);
6694 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006695#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006696 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006697 Py_DECREF(v);
6698 return NULL;
6699 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006700#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006701 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 Py_XDECREF(errorHandler);
6707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 return NULL;
6709}
6710
Alexander Belopolsky40018472011-02-26 01:02:56 +00006711PyObject *
6712PyUnicode_EncodeASCII(const Py_UNICODE *p,
6713 Py_ssize_t size,
6714 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Alexander Belopolsky40018472011-02-26 01:02:56 +00006719PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
6722 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 PyErr_BadArgument();
6724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006726 if (PyUnicode_READY(unicode) == -1)
6727 return NULL;
6728 /* Fast path: if it is an ASCII-only string, construct bytes object
6729 directly. Else defer to above function to raise the exception. */
6730 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6731 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6732 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735 errors);
6736}
6737
6738PyObject *
6739PyUnicode_AsASCIIString(PyObject *unicode)
6740{
6741 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742}
6743
Victor Stinner99b95382011-07-04 14:23:54 +02006744#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006745
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006746/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006747
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006748#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006749#define NEED_RETRY
6750#endif
6751
6752/* XXX This code is limited to "true" double-byte encodings, as
6753 a) it assumes an incomplete character consists of a single byte, and
6754 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back (it returns the same position), when the
   character found by CharPrev() does not begin with a lead byte, or when
   that previous character is exactly two bytes long.
   NOTE(review): presumably these checks rule out the case where the byte
   is really the trail byte of a preceding character -- confirm against
   the CharPrev documentation. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
6768
6769/*
6770 * Decode MBCS string into unicode object. If 'final' is set, converts
6771 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6772 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006773static int
6774decode_mbcs(PyUnicodeObject **v,
6775 const char *s, /* MBCS string */
6776 int size, /* sizeof MBCS string */
6777 int final,
6778 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779{
6780 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006781 Py_ssize_t n;
6782 DWORD usize;
6783 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784
6785 assert(size >= 0);
6786
Victor Stinner554f3f02010-06-16 23:33:54 +00006787 /* check and handle 'errors' arg */
6788 if (errors==NULL || strcmp(errors, "strict")==0)
6789 flags = MB_ERR_INVALID_CHARS;
6790 else if (strcmp(errors, "ignore")==0)
6791 flags = 0;
6792 else {
6793 PyErr_Format(PyExc_ValueError,
6794 "mbcs encoding does not support errors='%s'",
6795 errors);
6796 return -1;
6797 }
6798
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006799 /* Skip trailing lead-byte unless 'final' is set */
6800 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802
6803 /* First get the size of the result */
6804 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006805 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6806 if (usize==0)
6807 goto mbcs_decode_error;
6808 } else
6809 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
6811 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 /* Create unicode object */
6813 *v = _PyUnicode_New(usize);
6814 if (*v == NULL)
6815 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006816 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817 }
6818 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 /* Extend unicode object */
6820 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006821 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823 }
6824
6825 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006828 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6829 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006833
6834mbcs_decode_error:
6835 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6836 we raise a UnicodeDecodeError - else it is a 'generic'
6837 windows error
6838 */
6839 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6840 /* Ideally, we should get reason from FormatMessage - this
6841 is the Windows 2000 English version of the message
6842 */
6843 PyObject *exc = NULL;
6844 const char *reason = "No mapping for the Unicode character exists "
6845 "in the target multi-byte code page.";
6846 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6847 if (exc != NULL) {
6848 PyCodec_StrictErrors(exc);
6849 Py_DECREF(exc);
6850 }
6851 } else {
6852 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6853 }
6854 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855}
6856
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857PyObject *
6858PyUnicode_DecodeMBCSStateful(const char *s,
6859 Py_ssize_t size,
6860 const char *errors,
6861 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862{
6863 PyUnicodeObject *v = NULL;
6864 int done;
6865
6866 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868
6869#ifdef NEED_RETRY
6870 retry:
6871 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006872 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 else
6874#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006875 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006876
6877 if (done < 0) {
6878 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 }
6881
6882 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884
6885#ifdef NEED_RETRY
6886 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 s += done;
6888 size -= done;
6889 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890 }
6891#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006892#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006893 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006894 Py_DECREF(v);
6895 return NULL;
6896 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006897#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006898 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899 return (PyObject *)v;
6900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
6903PyUnicode_DecodeMBCS(const char *s,
6904 Py_ssize_t size,
6905 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006906{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6908}
6909
6910/*
6911 * Convert unicode into string object (MBCS).
6912 * Returns 0 if succeed, -1 otherwise.
6913 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914static int
6915encode_mbcs(PyObject **repr,
6916 const Py_UNICODE *p, /* unicode */
6917 int size, /* size of unicode */
6918 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919{
Victor Stinner554f3f02010-06-16 23:33:54 +00006920 BOOL usedDefaultChar = FALSE;
6921 BOOL *pusedDefaultChar;
6922 int mbcssize;
6923 Py_ssize_t n;
6924 PyObject *exc = NULL;
6925 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926
6927 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006928
Victor Stinner554f3f02010-06-16 23:33:54 +00006929 /* check and handle 'errors' arg */
6930 if (errors==NULL || strcmp(errors, "strict")==0) {
6931 flags = WC_NO_BEST_FIT_CHARS;
6932 pusedDefaultChar = &usedDefaultChar;
6933 } else if (strcmp(errors, "replace")==0) {
6934 flags = 0;
6935 pusedDefaultChar = NULL;
6936 } else {
6937 PyErr_Format(PyExc_ValueError,
6938 "mbcs encoding does not support errors='%s'",
6939 errors);
6940 return -1;
6941 }
6942
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006943 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006945 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6946 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 if (mbcssize == 0) {
6948 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6949 return -1;
6950 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006951 /* If we used a default char, then we failed! */
6952 if (pusedDefaultChar && *pusedDefaultChar)
6953 goto mbcs_encode_error;
6954 } else {
6955 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006956 }
6957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 /* Create string object */
6960 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6961 if (*repr == NULL)
6962 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006963 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964 }
6965 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 /* Extend string object */
6967 n = PyBytes_Size(*repr);
6968 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6969 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 }
6971
6972 /* Do the conversion */
6973 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006975 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6976 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6978 return -1;
6979 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006980 if (pusedDefaultChar && *pusedDefaultChar)
6981 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006983 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006984
6985mbcs_encode_error:
6986 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6987 Py_XDECREF(exc);
6988 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006989}
6990
Alexander Belopolsky40018472011-02-26 01:02:56 +00006991PyObject *
6992PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6993 Py_ssize_t size,
6994 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996 PyObject *repr = NULL;
6997 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006998
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007002 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007003 else
7004#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007005 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007006
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 Py_XDECREF(repr);
7009 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007010 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011
7012#ifdef NEED_RETRY
7013 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 p += INT_MAX;
7015 size -= INT_MAX;
7016 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017 }
7018#endif
7019
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007020 return repr;
7021}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007022
Alexander Belopolsky40018472011-02-26 01:02:56 +00007023PyObject *
7024PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007025{
7026 if (!PyUnicode_Check(unicode)) {
7027 PyErr_BadArgument();
7028 return NULL;
7029 }
7030 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 PyUnicode_GET_SIZE(unicode),
7032 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007033}
7034
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035#undef NEED_RETRY
7036
Victor Stinner99b95382011-07-04 14:23:54 +02007037#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007038
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039/* --- Character Mapping Codec -------------------------------------------- */
7040
Alexander Belopolsky40018472011-02-26 01:02:56 +00007041PyObject *
7042PyUnicode_DecodeCharmap(const char *s,
7043 Py_ssize_t size,
7044 PyObject *mapping,
7045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007047 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007048 Py_ssize_t startinpos;
7049 Py_ssize_t endinpos;
7050 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 PyUnicodeObject *v;
7053 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007054 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 PyObject *errorHandler = NULL;
7056 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007057 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007058 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007059
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 /* Default to Latin-1 */
7061 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064 v = _PyUnicode_New(size);
7065 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007070 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007071 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 mapstring = PyUnicode_AS_UNICODE(mapping);
7073 maplen = PyUnicode_GET_SIZE(mapping);
7074 while (s < e) {
7075 unsigned char ch = *s;
7076 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 if (ch < maplen)
7079 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 if (x == 0xfffe) {
7082 /* undefined mapping */
7083 outpos = p-PyUnicode_AS_UNICODE(v);
7084 startinpos = s-starts;
7085 endinpos = startinpos+1;
7086 if (unicode_decode_call_errorhandler(
7087 errors, &errorHandler,
7088 "charmap", "character maps to <undefined>",
7089 &starts, &e, &startinpos, &endinpos, &exc, &s,
7090 &v, &outpos, &p)) {
7091 goto onError;
7092 }
7093 continue;
7094 }
7095 *p++ = x;
7096 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007098 }
7099 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 while (s < e) {
7101 unsigned char ch = *s;
7102 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007103
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7105 w = PyLong_FromLong((long)ch);
7106 if (w == NULL)
7107 goto onError;
7108 x = PyObject_GetItem(mapping, w);
7109 Py_DECREF(w);
7110 if (x == NULL) {
7111 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7112 /* No mapping found means: mapping is undefined. */
7113 PyErr_Clear();
7114 x = Py_None;
7115 Py_INCREF(x);
7116 } else
7117 goto onError;
7118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007119
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 /* Apply mapping */
7121 if (PyLong_Check(x)) {
7122 long value = PyLong_AS_LONG(x);
7123 if (value < 0 || value > 65535) {
7124 PyErr_SetString(PyExc_TypeError,
7125 "character mapping must be in range(65536)");
7126 Py_DECREF(x);
7127 goto onError;
7128 }
7129 *p++ = (Py_UNICODE)value;
7130 }
7131 else if (x == Py_None) {
7132 /* undefined mapping */
7133 outpos = p-PyUnicode_AS_UNICODE(v);
7134 startinpos = s-starts;
7135 endinpos = startinpos+1;
7136 if (unicode_decode_call_errorhandler(
7137 errors, &errorHandler,
7138 "charmap", "character maps to <undefined>",
7139 &starts, &e, &startinpos, &endinpos, &exc, &s,
7140 &v, &outpos, &p)) {
7141 Py_DECREF(x);
7142 goto onError;
7143 }
7144 Py_DECREF(x);
7145 continue;
7146 }
7147 else if (PyUnicode_Check(x)) {
7148 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007149
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 if (targetsize == 1)
7151 /* 1-1 mapping */
7152 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007153
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 else if (targetsize > 1) {
7155 /* 1-n mapping */
7156 if (targetsize > extrachars) {
7157 /* resize first */
7158 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7159 Py_ssize_t needed = (targetsize - extrachars) + \
7160 (targetsize << 2);
7161 extrachars += needed;
7162 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007163 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyUnicode_GET_SIZE(v) + needed) < 0) {
7165 Py_DECREF(x);
7166 goto onError;
7167 }
7168 p = PyUnicode_AS_UNICODE(v) + oldpos;
7169 }
7170 Py_UNICODE_COPY(p,
7171 PyUnicode_AS_UNICODE(x),
7172 targetsize);
7173 p += targetsize;
7174 extrachars -= targetsize;
7175 }
7176 /* 1-0 mapping: skip the character */
7177 }
7178 else {
7179 /* wrong return value */
7180 PyErr_SetString(PyExc_TypeError,
7181 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007182 Py_DECREF(x);
7183 goto onError;
7184 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 Py_DECREF(x);
7186 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 }
7189 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007190 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 Py_XDECREF(errorHandler);
7193 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007194#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007195 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 Py_DECREF(v);
7197 return NULL;
7198 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007199#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007200 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007202
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 Py_XDECREF(errorHandler);
7205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 Py_XDECREF(v);
7207 return NULL;
7208}
7209
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007210/* Charmap encoding: the lookup table */
7211
Alexander Belopolsky40018472011-02-26 01:02:56 +00007212struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 PyObject_HEAD
7214 unsigned char level1[32];
7215 int count2, count3;
7216 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007217};
7218
7219static PyObject*
7220encoding_map_size(PyObject *obj, PyObject* args)
7221{
7222 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007223 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007225}
7226
7227static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007228 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 PyDoc_STR("Return the size (in bytes) of this object") },
7230 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007231};
7232
7233static void
7234encoding_map_dealloc(PyObject* o)
7235{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007236 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237}
7238
7239static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007240 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 "EncodingMap", /*tp_name*/
7242 sizeof(struct encoding_map), /*tp_basicsize*/
7243 0, /*tp_itemsize*/
7244 /* methods */
7245 encoding_map_dealloc, /*tp_dealloc*/
7246 0, /*tp_print*/
7247 0, /*tp_getattr*/
7248 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007249 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 0, /*tp_repr*/
7251 0, /*tp_as_number*/
7252 0, /*tp_as_sequence*/
7253 0, /*tp_as_mapping*/
7254 0, /*tp_hash*/
7255 0, /*tp_call*/
7256 0, /*tp_str*/
7257 0, /*tp_getattro*/
7258 0, /*tp_setattro*/
7259 0, /*tp_as_buffer*/
7260 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7261 0, /*tp_doc*/
7262 0, /*tp_traverse*/
7263 0, /*tp_clear*/
7264 0, /*tp_richcompare*/
7265 0, /*tp_weaklistoffset*/
7266 0, /*tp_iter*/
7267 0, /*tp_iternext*/
7268 encoding_map_methods, /*tp_methods*/
7269 0, /*tp_members*/
7270 0, /*tp_getset*/
7271 0, /*tp_base*/
7272 0, /*tp_dict*/
7273 0, /*tp_descr_get*/
7274 0, /*tp_descr_set*/
7275 0, /*tp_dictoffset*/
7276 0, /*tp_init*/
7277 0, /*tp_alloc*/
7278 0, /*tp_new*/
7279 0, /*tp_free*/
7280 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007281};
7282
7283PyObject*
7284PyUnicode_BuildEncodingMap(PyObject* string)
7285{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007286 PyObject *result;
7287 struct encoding_map *mresult;
7288 int i;
7289 int need_dict = 0;
7290 unsigned char level1[32];
7291 unsigned char level2[512];
7292 unsigned char *mlevel1, *mlevel2, *mlevel3;
7293 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007294 int kind;
7295 void *data;
7296 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007298 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007299 PyErr_BadArgument();
7300 return NULL;
7301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007302 kind = PyUnicode_KIND(string);
7303 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007304 memset(level1, 0xFF, sizeof level1);
7305 memset(level2, 0xFF, sizeof level2);
7306
7307 /* If there isn't a one-to-one mapping of NULL to \0,
7308 or if there are non-BMP characters, we need to use
7309 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007311 need_dict = 1;
7312 for (i = 1; i < 256; i++) {
7313 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 ch = PyUnicode_READ(kind, data, i);
7315 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316 need_dict = 1;
7317 break;
7318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007319 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007320 /* unmapped character */
7321 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007322 l1 = ch >> 11;
7323 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007324 if (level1[l1] == 0xFF)
7325 level1[l1] = count2++;
7326 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007327 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007328 }
7329
7330 if (count2 >= 0xFF || count3 >= 0xFF)
7331 need_dict = 1;
7332
7333 if (need_dict) {
7334 PyObject *result = PyDict_New();
7335 PyObject *key, *value;
7336 if (!result)
7337 return NULL;
7338 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007339 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007340 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007341 if (!key || !value)
7342 goto failed1;
7343 if (PyDict_SetItem(result, key, value) == -1)
7344 goto failed1;
7345 Py_DECREF(key);
7346 Py_DECREF(value);
7347 }
7348 return result;
7349 failed1:
7350 Py_XDECREF(key);
7351 Py_XDECREF(value);
7352 Py_DECREF(result);
7353 return NULL;
7354 }
7355
7356 /* Create a three-level trie */
7357 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7358 16*count2 + 128*count3 - 1);
7359 if (!result)
7360 return PyErr_NoMemory();
7361 PyObject_Init(result, &EncodingMapType);
7362 mresult = (struct encoding_map*)result;
7363 mresult->count2 = count2;
7364 mresult->count3 = count3;
7365 mlevel1 = mresult->level1;
7366 mlevel2 = mresult->level23;
7367 mlevel3 = mresult->level23 + 16*count2;
7368 memcpy(mlevel1, level1, 32);
7369 memset(mlevel2, 0xFF, 16*count2);
7370 memset(mlevel3, 0, 128*count3);
7371 count3 = 0;
7372 for (i = 1; i < 256; i++) {
7373 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007375 /* unmapped character */
7376 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 o1 = PyUnicode_READ(kind, data, i)>>11;
7378 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007379 i2 = 16*mlevel1[o1] + o2;
7380 if (mlevel2[i2] == 0xFF)
7381 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007382 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007383 i3 = 128*mlevel2[i2] + o3;
7384 mlevel3[i3] = i;
7385 }
7386 return result;
7387}
7388
7389static int
7390encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7391{
7392 struct encoding_map *map = (struct encoding_map*)mapping;
7393 int l1 = c>>11;
7394 int l2 = (c>>7) & 0xF;
7395 int l3 = c & 0x7F;
7396 int i;
7397
7398#ifdef Py_UNICODE_WIDE
7399 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007401 }
7402#endif
7403 if (c == 0)
7404 return 0;
7405 /* level 1*/
7406 i = map->level1[l1];
7407 if (i == 0xFF) {
7408 return -1;
7409 }
7410 /* level 2*/
7411 i = map->level23[16*i+l2];
7412 if (i == 0xFF) {
7413 return -1;
7414 }
7415 /* level 3 */
7416 i = map->level23[16*map->count2 + 128*i + l3];
7417 if (i == 0) {
7418 return -1;
7419 }
7420 return i;
7421}
7422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007423/* Lookup the character ch in the mapping. If the character
7424 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007425 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007426static PyObject *
7427charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Christian Heimes217cfd12007-12-02 14:31:20 +00007429 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007430 PyObject *x;
7431
7432 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007434 x = PyObject_GetItem(mapping, w);
7435 Py_DECREF(w);
7436 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7438 /* No mapping found means: mapping is undefined. */
7439 PyErr_Clear();
7440 x = Py_None;
7441 Py_INCREF(x);
7442 return x;
7443 } else
7444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007446 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007448 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 long value = PyLong_AS_LONG(x);
7450 if (value < 0 || value > 255) {
7451 PyErr_SetString(PyExc_TypeError,
7452 "character mapping must be in range(256)");
7453 Py_DECREF(x);
7454 return NULL;
7455 }
7456 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007458 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* wrong return value */
7462 PyErr_Format(PyExc_TypeError,
7463 "character mapping must return integer, bytes or None, not %.400s",
7464 x->ob_type->tp_name);
7465 Py_DECREF(x);
7466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 }
7468}
7469
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007471charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007472{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7474 /* exponentially overallocate to minimize reallocations */
7475 if (requiredsize < 2*outsize)
7476 requiredsize = 2*outsize;
7477 if (_PyBytes_Resize(outobj, requiredsize))
7478 return -1;
7479 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007480}
7481
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007486 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007487 space is available. Return a new reference to the object that
7488 was put in the output buffer, or Py_None, if the mapping was undefined
7489 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007490 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007491static charmapencode_result
7492charmapencode_output(Py_UNICODE c, PyObject *mapping,
7493 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007494{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007495 PyObject *rep;
7496 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007497 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498
Christian Heimes90aa7642007-12-19 02:45:37 +00007499 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007500 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007502 if (res == -1)
7503 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 if (outsize<requiredsize)
7505 if (charmapencode_resize(outobj, outpos, requiredsize))
7506 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007507 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 outstart[(*outpos)++] = (char)res;
7509 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007510 }
7511
7512 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007513 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007515 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 Py_DECREF(rep);
7517 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007518 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 if (PyLong_Check(rep)) {
7520 Py_ssize_t requiredsize = *outpos+1;
7521 if (outsize<requiredsize)
7522 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7523 Py_DECREF(rep);
7524 return enc_EXCEPTION;
7525 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007526 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 else {
7530 const char *repchars = PyBytes_AS_STRING(rep);
7531 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7532 Py_ssize_t requiredsize = *outpos+repsize;
7533 if (outsize<requiredsize)
7534 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7535 Py_DECREF(rep);
7536 return enc_EXCEPTION;
7537 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007538 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 memcpy(outstart + *outpos, repchars, repsize);
7540 *outpos += repsize;
7541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007542 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007543 Py_DECREF(rep);
7544 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545}
7546
7547/* handle an error in PyUnicode_EncodeCharmap
7548 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007549static int
7550charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007551 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007552 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007553 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007554 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555{
7556 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007557 Py_ssize_t repsize;
7558 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559 Py_UNICODE *uni2;
7560 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007561 Py_ssize_t collstartpos = *inpos;
7562 Py_ssize_t collendpos = *inpos+1;
7563 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 char *encoding = "charmap";
7565 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007566 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568 /* find all unencodable characters */
7569 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007571 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 int res = encoding_map_lookup(p[collendpos], mapping);
7573 if (res != -1)
7574 break;
7575 ++collendpos;
7576 continue;
7577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007578
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 rep = charmapencode_lookup(p[collendpos], mapping);
7580 if (rep==NULL)
7581 return -1;
7582 else if (rep!=Py_None) {
7583 Py_DECREF(rep);
7584 break;
7585 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007586 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 }
7589 /* cache callback name lookup
7590 * (if not done yet, i.e. it's the first error) */
7591 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 if ((errors==NULL) || (!strcmp(errors, "strict")))
7593 *known_errorHandler = 1;
7594 else if (!strcmp(errors, "replace"))
7595 *known_errorHandler = 2;
7596 else if (!strcmp(errors, "ignore"))
7597 *known_errorHandler = 3;
7598 else if (!strcmp(errors, "xmlcharrefreplace"))
7599 *known_errorHandler = 4;
7600 else
7601 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 }
7603 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007604 case 1: /* strict */
7605 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7606 return -1;
7607 case 2: /* replace */
7608 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 x = charmapencode_output('?', mapping, res, respos);
7610 if (x==enc_EXCEPTION) {
7611 return -1;
7612 }
7613 else if (x==enc_FAILED) {
7614 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7615 return -1;
7616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007617 }
7618 /* fall through */
7619 case 3: /* ignore */
7620 *inpos = collendpos;
7621 break;
7622 case 4: /* xmlcharrefreplace */
7623 /* generate replacement (temporarily (mis)uses p) */
7624 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 char buffer[2+29+1+1];
7626 char *cp;
7627 sprintf(buffer, "&#%d;", (int)p[collpos]);
7628 for (cp = buffer; *cp; ++cp) {
7629 x = charmapencode_output(*cp, mapping, res, respos);
7630 if (x==enc_EXCEPTION)
7631 return -1;
7632 else if (x==enc_FAILED) {
7633 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7634 return -1;
7635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 }
7637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638 *inpos = collendpos;
7639 break;
7640 default:
7641 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 encoding, reason, p, size, exceptionObject,
7643 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007646 if (PyBytes_Check(repunicode)) {
7647 /* Directly copy bytes result to output. */
7648 Py_ssize_t outsize = PyBytes_Size(*res);
7649 Py_ssize_t requiredsize;
7650 repsize = PyBytes_Size(repunicode);
7651 requiredsize = *respos + repsize;
7652 if (requiredsize > outsize)
7653 /* Make room for all additional bytes. */
7654 if (charmapencode_resize(res, respos, requiredsize)) {
7655 Py_DECREF(repunicode);
7656 return -1;
7657 }
7658 memcpy(PyBytes_AsString(*res) + *respos,
7659 PyBytes_AsString(repunicode), repsize);
7660 *respos += repsize;
7661 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007662 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007663 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 /* generate replacement */
7666 repsize = PyUnicode_GET_SIZE(repunicode);
7667 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 x = charmapencode_output(*uni2, mapping, res, respos);
7669 if (x==enc_EXCEPTION) {
7670 return -1;
7671 }
7672 else if (x==enc_FAILED) {
7673 Py_DECREF(repunicode);
7674 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7675 return -1;
7676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 }
7678 *inpos = newpos;
7679 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 }
7681 return 0;
7682}
7683
Alexander Belopolsky40018472011-02-26 01:02:56 +00007684PyObject *
7685PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7686 Py_ssize_t size,
7687 PyObject *mapping,
7688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690 /* output object */
7691 PyObject *res = NULL;
7692 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007693 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007695 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696 PyObject *errorHandler = NULL;
7697 PyObject *exc = NULL;
7698 /* the following variable is used for caching string comparisons
7699 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7700 * 3=ignore, 4=xmlcharrefreplace */
7701 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702
7703 /* Default to Latin-1 */
7704 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 /* allocate enough for a simple encoding without
7708 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007709 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 if (res == NULL)
7711 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007712 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007715 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 /* try to encode it */
7717 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7718 if (x==enc_EXCEPTION) /* error */
7719 goto onError;
7720 if (x==enc_FAILED) { /* unencodable character */
7721 if (charmap_encoding_error(p, size, &inpos, mapping,
7722 &exc,
7723 &known_errorHandler, &errorHandler, errors,
7724 &res, &respos)) {
7725 goto onError;
7726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 else
7729 /* done with this character => adjust input position */
7730 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007733 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007734 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007735 if (_PyBytes_Resize(&res, respos) < 0)
7736 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738 Py_XDECREF(exc);
7739 Py_XDECREF(errorHandler);
7740 return res;
7741
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743 Py_XDECREF(res);
7744 Py_XDECREF(exc);
7745 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 return NULL;
7747}
7748
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749PyObject *
7750PyUnicode_AsCharmapString(PyObject *unicode,
7751 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752{
7753 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 PyErr_BadArgument();
7755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 }
7757 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 PyUnicode_GET_SIZE(unicode),
7759 mapping,
7760 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761}
7762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007764static void
7765make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007767 Py_ssize_t startpos, Py_ssize_t endpos,
7768 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007770 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 *exceptionObject = _PyUnicodeTranslateError_Create(
7772 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 }
7774 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7776 goto onError;
7777 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7778 goto onError;
7779 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7780 goto onError;
7781 return;
7782 onError:
7783 Py_DECREF(*exceptionObject);
7784 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 }
7786}
7787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007788/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007789static void
7790raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007792 Py_ssize_t startpos, Py_ssize_t endpos,
7793 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794{
7795 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007797 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799}
7800
7801/* error handling callback helper:
7802 build arguments, call the callback and check the arguments,
7803 put the result into newpos and return the replacement string, which
7804 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007805static PyObject *
7806unicode_translate_call_errorhandler(const char *errors,
7807 PyObject **errorHandler,
7808 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007810 Py_ssize_t startpos, Py_ssize_t endpos,
7811 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007813 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007815 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 PyObject *restuple;
7817 PyObject *resunicode;
7818
7819 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 }
7824
7825 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007829
7830 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007832 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007835 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 Py_DECREF(restuple);
7837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007838 }
7839 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 &resunicode, &i_newpos)) {
7841 Py_DECREF(restuple);
7842 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007844 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007846 else
7847 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7850 Py_DECREF(restuple);
7851 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 Py_INCREF(resunicode);
7854 Py_DECREF(restuple);
7855 return resunicode;
7856}
7857
7858/* Lookup the character ch in the mapping and put the result in result,
7859 which must be decrefed by the caller.
7860 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007861static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863{
Christian Heimes217cfd12007-12-02 14:31:20 +00007864 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 PyObject *x;
7866
7867 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869 x = PyObject_GetItem(mapping, w);
7870 Py_DECREF(w);
7871 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7873 /* No mapping found means: use 1:1 mapping. */
7874 PyErr_Clear();
7875 *result = NULL;
7876 return 0;
7877 } else
7878 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879 }
7880 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 *result = x;
7882 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007884 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 long value = PyLong_AS_LONG(x);
7886 long max = PyUnicode_GetMax();
7887 if (value < 0 || value > max) {
7888 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007889 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_DECREF(x);
7891 return -1;
7892 }
7893 *result = x;
7894 return 0;
7895 }
7896 else if (PyUnicode_Check(x)) {
7897 *result = x;
7898 return 0;
7899 }
7900 else {
7901 /* wrong return value */
7902 PyErr_SetString(PyExc_TypeError,
7903 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007904 Py_DECREF(x);
7905 return -1;
7906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007907}
7908/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 if not reallocate and adjust various state variables.
7910 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007914{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007916 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 /* exponentially overallocate to minimize reallocations */
7918 if (requiredsize < 2 * oldsize)
7919 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7921 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007924 }
7925 return 0;
7926}
7927/* lookup the character, put the result in the output string and adjust
7928 various state variables. Return a new reference to the object that
7929 was put in the output buffer in *result, or Py_None, if the mapping was
7930 undefined (in which case no character was written).
7931 The called must decref result.
7932 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007933static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7935 PyObject *mapping, Py_UCS4 **output,
7936 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007937 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7940 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007942 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945 }
7946 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007948 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007950 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951 }
7952 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 Py_ssize_t repsize;
7954 if (PyUnicode_READY(*res) == -1)
7955 return -1;
7956 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 if (repsize==1) {
7958 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007959 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 }
7961 else if (repsize!=0) {
7962 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 Py_ssize_t requiredsize = *opos +
7964 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007966 Py_ssize_t i;
7967 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 for(i = 0; i < repsize; i++)
7970 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 }
7973 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975 return 0;
7976}
7977
Alexander Belopolsky40018472011-02-26 01:02:56 +00007978PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979_PyUnicode_TranslateCharmap(PyObject *input,
7980 PyObject *mapping,
7981 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 /* input object */
7984 char *idata;
7985 Py_ssize_t size, i;
7986 int kind;
7987 /* output buffer */
7988 Py_UCS4 *output = NULL;
7989 Py_ssize_t osize;
7990 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 char *reason = "character maps to <undefined>";
7994 PyObject *errorHandler = NULL;
7995 PyObject *exc = NULL;
7996 /* the following variable is used for caching string comparisons
7997 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7998 * 3=ignore, 4=xmlcharrefreplace */
7999 int known_errorHandler = -1;
8000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 PyErr_BadArgument();
8003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 if (PyUnicode_READY(input) == -1)
8007 return NULL;
8008 idata = (char*)PyUnicode_DATA(input);
8009 kind = PyUnicode_KIND(input);
8010 size = PyUnicode_GET_LENGTH(input);
8011 i = 0;
8012
8013 if (size == 0) {
8014 Py_INCREF(input);
8015 return input;
8016 }
8017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018 /* allocate enough for a simple 1:1 translation without
8019 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008020 osize = size;
8021 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8022 opos = 0;
8023 if (output == NULL) {
8024 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008028 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 /* try to encode it */
8030 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 if (charmaptranslate_output(input, i, mapping,
8032 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 Py_XDECREF(x);
8034 goto onError;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 else { /* untranslatable character */
8040 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8041 Py_ssize_t repsize;
8042 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008045 Py_ssize_t collstart = i;
8046 Py_ssize_t collend = i+1;
8047 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 while (collend < size) {
8051 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 goto onError;
8053 Py_XDECREF(x);
8054 if (x!=Py_None)
8055 break;
8056 ++collend;
8057 }
8058 /* cache callback name lookup
8059 * (if not done yet, i.e. it's the first error) */
8060 if (known_errorHandler==-1) {
8061 if ((errors==NULL) || (!strcmp(errors, "strict")))
8062 known_errorHandler = 1;
8063 else if (!strcmp(errors, "replace"))
8064 known_errorHandler = 2;
8065 else if (!strcmp(errors, "ignore"))
8066 known_errorHandler = 3;
8067 else if (!strcmp(errors, "xmlcharrefreplace"))
8068 known_errorHandler = 4;
8069 else
8070 known_errorHandler = 0;
8071 }
8072 switch (known_errorHandler) {
8073 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 raise_translate_exception(&exc, input, collstart,
8075 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008076 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 case 2: /* replace */
8078 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 for (coll = collstart; coll<collend; coll++)
8080 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 /* fall through */
8082 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 break;
8085 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 /* generate replacement (temporarily (mis)uses i) */
8087 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 char buffer[2+29+1+1];
8089 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8091 if (charmaptranslate_makespace(&output, &osize,
8092 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 goto onError;
8094 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 break;
8099 default:
8100 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 reason, input, &exc,
8102 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008103 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 goto onError;
8105 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 repsize = PyUnicode_GET_LENGTH(repunicode);
8107 if (charmaptranslate_makespace(&output, &osize,
8108 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 Py_DECREF(repunicode);
8110 goto onError;
8111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 for (uni2 = 0; repsize-->0; ++uni2)
8113 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8114 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 }
8118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8120 if (!res)
8121 goto onError;
8122 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 Py_XDECREF(exc);
8124 Py_XDECREF(errorHandler);
8125 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 Py_XDECREF(exc);
8130 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 return NULL;
8132}
8133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134/* Deprecated. Use PyUnicode_Translate instead. */
8135PyObject *
8136PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8137 Py_ssize_t size,
8138 PyObject *mapping,
8139 const char *errors)
8140{
8141 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8142 if (!unicode)
8143 return NULL;
8144 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8145}
8146
Alexander Belopolsky40018472011-02-26 01:02:56 +00008147PyObject *
8148PyUnicode_Translate(PyObject *str,
8149 PyObject *mapping,
8150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151{
8152 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008153
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 str = PyUnicode_FromObject(str);
8155 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 Py_DECREF(str);
8159 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008160
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 Py_XDECREF(str);
8163 return NULL;
8164}
Tim Petersced69f82003-09-16 20:30:58 +00008165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008167fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168{
8169 /* No need to call PyUnicode_READY(self) because this function is only
8170 called as a callback from fixup() which does it already. */
8171 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8172 const int kind = PyUnicode_KIND(self);
8173 void *data = PyUnicode_DATA(self);
8174 Py_UCS4 maxchar = 0, ch, fixed;
8175 Py_ssize_t i;
8176
8177 for (i = 0; i < len; ++i) {
8178 ch = PyUnicode_READ(kind, data, i);
8179 fixed = 0;
8180 if (ch > 127) {
8181 if (Py_UNICODE_ISSPACE(ch))
8182 fixed = ' ';
8183 else {
8184 const int decimal = Py_UNICODE_TODECIMAL(ch);
8185 if (decimal >= 0)
8186 fixed = '0' + decimal;
8187 }
8188 if (fixed != 0) {
8189 if (fixed > maxchar)
8190 maxchar = fixed;
8191 PyUnicode_WRITE(kind, data, i, fixed);
8192 }
8193 else if (ch > maxchar)
8194 maxchar = ch;
8195 }
8196 else if (ch > maxchar)
8197 maxchar = ch;
8198 }
8199
8200 return maxchar;
8201}
8202
8203PyObject *
8204_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8205{
8206 if (!PyUnicode_Check(unicode)) {
8207 PyErr_BadInternalCall();
8208 return NULL;
8209 }
8210 if (PyUnicode_READY(unicode) == -1)
8211 return NULL;
8212 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8213 /* If the string is already ASCII, just return the same string */
8214 Py_INCREF(unicode);
8215 return unicode;
8216 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008217 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218}
8219
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008220PyObject *
8221PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8222 Py_ssize_t length)
8223{
8224 PyObject *result;
8225 Py_UNICODE *p; /* write pointer into result */
8226 Py_ssize_t i;
8227 /* Copy to a new string */
8228 result = (PyObject *)_PyUnicode_New(length);
8229 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8230 if (result == NULL)
8231 return result;
8232 p = PyUnicode_AS_UNICODE(result);
8233 /* Iterate over code points */
8234 for (i = 0; i < length; i++) {
8235 Py_UNICODE ch =s[i];
8236 if (ch > 127) {
8237 int decimal = Py_UNICODE_TODECIMAL(ch);
8238 if (decimal >= 0)
8239 p[i] = '0' + decimal;
8240 }
8241 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008242#ifndef DONT_MAKE_RESULT_READY
8243 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 Py_DECREF(result);
8245 return NULL;
8246 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008247#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008248 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008249 return result;
8250}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008251/* --- Decimal Encoder ---------------------------------------------------- */
8252
Alexander Belopolsky40018472011-02-26 01:02:56 +00008253int
8254PyUnicode_EncodeDecimal(Py_UNICODE *s,
8255 Py_ssize_t length,
8256 char *output,
8257 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008258{
8259 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 PyObject *errorHandler = NULL;
8261 PyObject *exc = NULL;
8262 const char *encoding = "decimal";
8263 const char *reason = "invalid decimal Unicode string";
8264 /* the following variable is used for caching string comparisons
8265 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8266 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008267
8268 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyErr_BadArgument();
8270 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008271 }
8272
8273 p = s;
8274 end = s + length;
8275 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 register Py_UNICODE ch = *p;
8277 int decimal;
8278 PyObject *repunicode;
8279 Py_ssize_t repsize;
8280 Py_ssize_t newpos;
8281 Py_UNICODE *uni2;
8282 Py_UNICODE *collstart;
8283 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008284
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008286 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 ++p;
8288 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 decimal = Py_UNICODE_TODECIMAL(ch);
8291 if (decimal >= 0) {
8292 *output++ = '0' + decimal;
8293 ++p;
8294 continue;
8295 }
8296 if (0 < ch && ch < 256) {
8297 *output++ = (char)ch;
8298 ++p;
8299 continue;
8300 }
8301 /* All other characters are considered unencodable */
8302 collstart = p;
8303 collend = p+1;
8304 while (collend < end) {
8305 if ((0 < *collend && *collend < 256) ||
8306 !Py_UNICODE_ISSPACE(*collend) ||
8307 Py_UNICODE_TODECIMAL(*collend))
8308 break;
8309 }
8310 /* cache callback name lookup
8311 * (if not done yet, i.e. it's the first error) */
8312 if (known_errorHandler==-1) {
8313 if ((errors==NULL) || (!strcmp(errors, "strict")))
8314 known_errorHandler = 1;
8315 else if (!strcmp(errors, "replace"))
8316 known_errorHandler = 2;
8317 else if (!strcmp(errors, "ignore"))
8318 known_errorHandler = 3;
8319 else if (!strcmp(errors, "xmlcharrefreplace"))
8320 known_errorHandler = 4;
8321 else
8322 known_errorHandler = 0;
8323 }
8324 switch (known_errorHandler) {
8325 case 1: /* strict */
8326 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8327 goto onError;
8328 case 2: /* replace */
8329 for (p = collstart; p < collend; ++p)
8330 *output++ = '?';
8331 /* fall through */
8332 case 3: /* ignore */
8333 p = collend;
8334 break;
8335 case 4: /* xmlcharrefreplace */
8336 /* generate replacement (temporarily (mis)uses p) */
8337 for (p = collstart; p < collend; ++p)
8338 output += sprintf(output, "&#%d;", (int)*p);
8339 p = collend;
8340 break;
8341 default:
8342 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8343 encoding, reason, s, length, &exc,
8344 collstart-s, collend-s, &newpos);
8345 if (repunicode == NULL)
8346 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008347 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008348 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008349 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8350 Py_DECREF(repunicode);
8351 goto onError;
8352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 /* generate replacement */
8354 repsize = PyUnicode_GET_SIZE(repunicode);
8355 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8356 Py_UNICODE ch = *uni2;
8357 if (Py_UNICODE_ISSPACE(ch))
8358 *output++ = ' ';
8359 else {
8360 decimal = Py_UNICODE_TODECIMAL(ch);
8361 if (decimal >= 0)
8362 *output++ = '0' + decimal;
8363 else if (0 < ch && ch < 256)
8364 *output++ = (char)ch;
8365 else {
8366 Py_DECREF(repunicode);
8367 raise_encode_exception(&exc, encoding,
8368 s, length, collstart-s, collend-s, reason);
8369 goto onError;
8370 }
8371 }
8372 }
8373 p = s + newpos;
8374 Py_DECREF(repunicode);
8375 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008376 }
8377 /* 0-terminate the output string */
8378 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 Py_XDECREF(exc);
8380 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008381 return 0;
8382
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 Py_XDECREF(exc);
8385 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008386 return -1;
8387}
8388
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389/* --- Helpers ------------------------------------------------------------ */
8390
Victor Stinnerc3cec782011-10-05 21:24:08 +02008391#include "stringlib/asciilib.h"
8392#include "stringlib/fastsearch.h"
8393#include "stringlib/partition.h"
8394#include "stringlib/split.h"
8395#include "stringlib/count.h"
8396#include "stringlib/find.h"
8397#include "stringlib/localeutil.h"
8398#include "stringlib/undef.h"
8399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400#include "stringlib/ucs1lib.h"
8401#include "stringlib/fastsearch.h"
8402#include "stringlib/partition.h"
8403#include "stringlib/split.h"
8404#include "stringlib/count.h"
8405#include "stringlib/find.h"
8406#include "stringlib/localeutil.h"
8407#include "stringlib/undef.h"
8408
8409#include "stringlib/ucs2lib.h"
8410#include "stringlib/fastsearch.h"
8411#include "stringlib/partition.h"
8412#include "stringlib/split.h"
8413#include "stringlib/count.h"
8414#include "stringlib/find.h"
8415#include "stringlib/localeutil.h"
8416#include "stringlib/undef.h"
8417
8418#include "stringlib/ucs4lib.h"
8419#include "stringlib/fastsearch.h"
8420#include "stringlib/partition.h"
8421#include "stringlib/split.h"
8422#include "stringlib/count.h"
8423#include "stringlib/find.h"
8424#include "stringlib/localeutil.h"
8425#include "stringlib/undef.h"
8426
8427static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008428any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8429 const Py_UCS1*, Py_ssize_t,
8430 Py_ssize_t, Py_ssize_t),
8431 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 const Py_UCS1*, Py_ssize_t,
8433 Py_ssize_t, Py_ssize_t),
8434 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8435 const Py_UCS2*, Py_ssize_t,
8436 Py_ssize_t, Py_ssize_t),
8437 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8438 const Py_UCS4*, Py_ssize_t,
8439 Py_ssize_t, Py_ssize_t),
8440 PyObject* s1, PyObject* s2,
8441 Py_ssize_t start,
8442 Py_ssize_t end)
8443{
8444 int kind1, kind2, kind;
8445 void *buf1, *buf2;
8446 Py_ssize_t len1, len2, result;
8447
8448 kind1 = PyUnicode_KIND(s1);
8449 kind2 = PyUnicode_KIND(s2);
8450 kind = kind1 > kind2 ? kind1 : kind2;
8451 buf1 = PyUnicode_DATA(s1);
8452 buf2 = PyUnicode_DATA(s2);
8453 if (kind1 != kind)
8454 buf1 = _PyUnicode_AsKind(s1, kind);
8455 if (!buf1)
8456 return -2;
8457 if (kind2 != kind)
8458 buf2 = _PyUnicode_AsKind(s2, kind);
8459 if (!buf2) {
8460 if (kind1 != kind) PyMem_Free(buf1);
8461 return -2;
8462 }
8463 len1 = PyUnicode_GET_LENGTH(s1);
8464 len2 = PyUnicode_GET_LENGTH(s2);
8465
8466 switch(kind) {
8467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008468 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8469 result = ascii(buf1, len1, buf2, len2, start, end);
8470 else
8471 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 break;
8473 case PyUnicode_2BYTE_KIND:
8474 result = ucs2(buf1, len1, buf2, len2, start, end);
8475 break;
8476 case PyUnicode_4BYTE_KIND:
8477 result = ucs4(buf1, len1, buf2, len2, start, end);
8478 break;
8479 default:
8480 assert(0); result = -2;
8481 }
8482
8483 if (kind1 != kind)
8484 PyMem_Free(buf1);
8485 if (kind2 != kind)
8486 PyMem_Free(buf2);
8487
8488 return result;
8489}
8490
8491Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008492_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 Py_ssize_t n_buffer,
8494 void *digits, Py_ssize_t n_digits,
8495 Py_ssize_t min_width,
8496 const char *grouping,
8497 const char *thousands_sep)
8498{
8499 switch(kind) {
8500 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008501 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8502 return _PyUnicode_ascii_InsertThousandsGrouping(
8503 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8504 min_width, grouping, thousands_sep);
8505 else
8506 return _PyUnicode_ucs1_InsertThousandsGrouping(
8507 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8508 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 case PyUnicode_2BYTE_KIND:
8510 return _PyUnicode_ucs2_InsertThousandsGrouping(
8511 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8512 min_width, grouping, thousands_sep);
8513 case PyUnicode_4BYTE_KIND:
8514 return _PyUnicode_ucs4_InsertThousandsGrouping(
8515 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8516 min_width, grouping, thousands_sep);
8517 }
8518 assert(0);
8519 return -1;
8520}
8521
8522
Eric Smith8c663262007-08-25 02:26:07 +00008523#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008525
Thomas Wouters477c8d52006-05-27 19:21:47 +00008526#include "stringlib/count.h"
8527#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008528
/* Helper macro to fix up start/end slice values against a length `len`:
   an `end` beyond `len` is clamped to `len`; a negative `start` or `end`
   has `len` added and is then clamped to 0 if still negative.  A `start`
   larger than `len` is left untouched.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.  The body is wrapped in do { } while (0) so
   the macro behaves as a single statement (e.g. inside an unbraced
   if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008543
Alexander Belopolsky40018472011-02-26 01:02:56 +00008544Py_ssize_t
8545PyUnicode_Count(PyObject *str,
8546 PyObject *substr,
8547 Py_ssize_t start,
8548 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008550 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008551 PyUnicodeObject* str_obj;
8552 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 int kind1, kind2, kind;
8554 void *buf1 = NULL, *buf2 = NULL;
8555 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008556
Thomas Wouters477c8d52006-05-27 19:21:47 +00008557 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008560 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008561 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 Py_DECREF(str_obj);
8563 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Tim Petersced69f82003-09-16 20:30:58 +00008565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 kind1 = PyUnicode_KIND(str_obj);
8567 kind2 = PyUnicode_KIND(sub_obj);
8568 kind = kind1 > kind2 ? kind1 : kind2;
8569 buf1 = PyUnicode_DATA(str_obj);
8570 if (kind1 != kind)
8571 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8572 if (!buf1)
8573 goto onError;
8574 buf2 = PyUnicode_DATA(sub_obj);
8575 if (kind2 != kind)
8576 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8577 if (!buf2)
8578 goto onError;
8579 len1 = PyUnicode_GET_LENGTH(str_obj);
8580 len2 = PyUnicode_GET_LENGTH(sub_obj);
8581
8582 ADJUST_INDICES(start, end, len1);
8583 switch(kind) {
8584 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008585 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8586 result = asciilib_count(
8587 ((Py_UCS1*)buf1) + start, end - start,
8588 buf2, len2, PY_SSIZE_T_MAX
8589 );
8590 else
8591 result = ucs1lib_count(
8592 ((Py_UCS1*)buf1) + start, end - start,
8593 buf2, len2, PY_SSIZE_T_MAX
8594 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 break;
8596 case PyUnicode_2BYTE_KIND:
8597 result = ucs2lib_count(
8598 ((Py_UCS2*)buf1) + start, end - start,
8599 buf2, len2, PY_SSIZE_T_MAX
8600 );
8601 break;
8602 case PyUnicode_4BYTE_KIND:
8603 result = ucs4lib_count(
8604 ((Py_UCS4*)buf1) + start, end - start,
8605 buf2, len2, PY_SSIZE_T_MAX
8606 );
8607 break;
8608 default:
8609 assert(0); result = 0;
8610 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008611
8612 Py_DECREF(sub_obj);
8613 Py_DECREF(str_obj);
8614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 if (kind1 != kind)
8616 PyMem_Free(buf1);
8617 if (kind2 != kind)
8618 PyMem_Free(buf2);
8619
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 onError:
8622 Py_DECREF(sub_obj);
8623 Py_DECREF(str_obj);
8624 if (kind1 != kind && buf1)
8625 PyMem_Free(buf1);
8626 if (kind2 != kind && buf2)
8627 PyMem_Free(buf2);
8628 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629}
8630
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631Py_ssize_t
8632PyUnicode_Find(PyObject *str,
8633 PyObject *sub,
8634 Py_ssize_t start,
8635 Py_ssize_t end,
8636 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008638 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008639
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008643 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 Py_DECREF(str);
8646 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 }
Tim Petersced69f82003-09-16 20:30:58 +00008648
Thomas Wouters477c8d52006-05-27 19:21:47 +00008649 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008651 asciilib_find_slice, ucs1lib_find_slice,
8652 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008654 );
8655 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008657 asciilib_find_slice, ucs1lib_rfind_slice,
8658 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008660 );
8661
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008663 Py_DECREF(sub);
8664
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 return result;
8666}
8667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668Py_ssize_t
8669PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8670 Py_ssize_t start, Py_ssize_t end,
8671 int direction)
8672{
8673 char *result;
8674 int kind;
8675 if (PyUnicode_READY(str) == -1)
8676 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008677 if (start < 0 || end < 0) {
8678 PyErr_SetString(PyExc_IndexError, "string index out of range");
8679 return -2;
8680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 if (end > PyUnicode_GET_LENGTH(str))
8682 end = PyUnicode_GET_LENGTH(str);
8683 kind = PyUnicode_KIND(str);
8684 result = findchar(PyUnicode_1BYTE_DATA(str)
8685 + PyUnicode_KIND_SIZE(kind, start),
8686 kind,
8687 end-start, ch, direction);
8688 if (!result)
8689 return -1;
8690 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8691}
8692
Alexander Belopolsky40018472011-02-26 01:02:56 +00008693static int
8694tailmatch(PyUnicodeObject *self,
8695 PyUnicodeObject *substring,
8696 Py_ssize_t start,
8697 Py_ssize_t end,
8698 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 int kind_self;
8701 int kind_sub;
8702 void *data_self;
8703 void *data_sub;
8704 Py_ssize_t offset;
8705 Py_ssize_t i;
8706 Py_ssize_t end_sub;
8707
8708 if (PyUnicode_READY(self) == -1 ||
8709 PyUnicode_READY(substring) == -1)
8710 return 0;
8711
8712 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 return 1;
8714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8716 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 kind_self = PyUnicode_KIND(self);
8721 data_self = PyUnicode_DATA(self);
8722 kind_sub = PyUnicode_KIND(substring);
8723 data_sub = PyUnicode_DATA(substring);
8724 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8725
8726 if (direction > 0)
8727 offset = end;
8728 else
8729 offset = start;
8730
8731 if (PyUnicode_READ(kind_self, data_self, offset) ==
8732 PyUnicode_READ(kind_sub, data_sub, 0) &&
8733 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8734 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8735 /* If both are of the same kind, memcmp is sufficient */
8736 if (kind_self == kind_sub) {
8737 return ! memcmp((char *)data_self +
8738 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8739 data_sub,
8740 PyUnicode_GET_LENGTH(substring) *
8741 PyUnicode_CHARACTER_SIZE(substring));
8742 }
8743 /* otherwise we have to compare each character by first accesing it */
8744 else {
8745 /* We do not need to compare 0 and len(substring)-1 because
8746 the if statement above ensured already that they are equal
8747 when we end up here. */
8748 // TODO: honor direction and do a forward or backwards search
8749 for (i = 1; i < end_sub; ++i) {
8750 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8751 PyUnicode_READ(kind_sub, data_sub, i))
8752 return 0;
8753 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 }
8757
8758 return 0;
8759}
8760
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761Py_ssize_t
8762PyUnicode_Tailmatch(PyObject *str,
8763 PyObject *substr,
8764 Py_ssize_t start,
8765 Py_ssize_t end,
8766 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008768 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008769
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 str = PyUnicode_FromObject(str);
8771 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 substr = PyUnicode_FromObject(substr);
8774 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 Py_DECREF(str);
8776 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 }
Tim Petersced69f82003-09-16 20:30:58 +00008778
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 (PyUnicodeObject *)substr,
8781 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 Py_DECREF(str);
8783 Py_DECREF(substr);
8784 return result;
8785}
8786
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787/* Apply fixfct filter to the Unicode object self and return a
8788 reference to the modified object */
8789
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008791fixup(PyObject *self,
8792 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 PyObject *u;
8795 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 if (PyUnicode_READY(self) == -1)
8798 return NULL;
8799 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8800 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8801 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8806 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 /* fix functions return the new maximum character in a string,
8809 if the kind of the resulting unicode object does not change,
8810 everything is fine. Otherwise we need to change the string kind
8811 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008812 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 if (maxchar_new == 0)
8814 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8815 else if (maxchar_new <= 127)
8816 maxchar_new = 127;
8817 else if (maxchar_new <= 255)
8818 maxchar_new = 255;
8819 else if (maxchar_new <= 65535)
8820 maxchar_new = 65535;
8821 else
8822 maxchar_new = 1114111; /* 0x10ffff */
8823
8824 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 /* fixfct should return TRUE if it modified the buffer. If
8826 FALSE, return a reference to the original buffer instead
8827 (to save space, not time) */
8828 Py_INCREF(self);
8829 Py_DECREF(u);
8830 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 else if (maxchar_new == maxchar_old) {
8833 return u;
8834 }
8835 else {
8836 /* In case the maximum character changed, we need to
8837 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008838 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 if (v == NULL) {
8840 Py_DECREF(u);
8841 return NULL;
8842 }
8843 if (maxchar_new > maxchar_old) {
8844 /* If the maxchar increased so that the kind changed, not all
8845 characters are representable anymore and we need to fix the
8846 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008847 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008848 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8850 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008851 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008852 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854
8855 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008856 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 return v;
8858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859}
8860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008862fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 /* No need to call PyUnicode_READY(self) because this function is only
8865 called as a callback from fixup() which does it already. */
8866 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8867 const int kind = PyUnicode_KIND(self);
8868 void *data = PyUnicode_DATA(self);
8869 int touched = 0;
8870 Py_UCS4 maxchar = 0;
8871 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 for (i = 0; i < len; ++i) {
8874 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8875 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8876 if (up != ch) {
8877 if (up > maxchar)
8878 maxchar = up;
8879 PyUnicode_WRITE(kind, data, i, up);
8880 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 else if (ch > maxchar)
8883 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 }
8885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 if (touched)
8887 return maxchar;
8888 else
8889 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890}
8891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008893fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8896 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8897 const int kind = PyUnicode_KIND(self);
8898 void *data = PyUnicode_DATA(self);
8899 int touched = 0;
8900 Py_UCS4 maxchar = 0;
8901 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 for(i = 0; i < len; ++i) {
8904 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8905 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8906 if (lo != ch) {
8907 if (lo > maxchar)
8908 maxchar = lo;
8909 PyUnicode_WRITE(kind, data, i, lo);
8910 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 else if (ch > maxchar)
8913 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 }
8915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 if (touched)
8917 return maxchar;
8918 else
8919 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920}
8921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008923fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8926 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8927 const int kind = PyUnicode_KIND(self);
8928 void *data = PyUnicode_DATA(self);
8929 int touched = 0;
8930 Py_UCS4 maxchar = 0;
8931 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 for(i = 0; i < len; ++i) {
8934 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8935 Py_UCS4 nu = 0;
8936
8937 if (Py_UNICODE_ISUPPER(ch))
8938 nu = Py_UNICODE_TOLOWER(ch);
8939 else if (Py_UNICODE_ISLOWER(ch))
8940 nu = Py_UNICODE_TOUPPER(ch);
8941
8942 if (nu != 0) {
8943 if (nu > maxchar)
8944 maxchar = nu;
8945 PyUnicode_WRITE(kind, data, i, nu);
8946 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 else if (ch > maxchar)
8949 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 }
8951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 if (touched)
8953 return maxchar;
8954 else
8955 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956}
8957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008959fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8962 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8963 const int kind = PyUnicode_KIND(self);
8964 void *data = PyUnicode_DATA(self);
8965 int touched = 0;
8966 Py_UCS4 maxchar = 0;
8967 Py_ssize_t i = 0;
8968 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008969
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008970 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972
8973 ch = PyUnicode_READ(kind, data, i);
8974 if (!Py_UNICODE_ISUPPER(ch)) {
8975 maxchar = Py_UNICODE_TOUPPER(ch);
8976 PyUnicode_WRITE(kind, data, i, maxchar);
8977 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 ++i;
8980 for(; i < len; ++i) {
8981 ch = PyUnicode_READ(kind, data, i);
8982 if (!Py_UNICODE_ISLOWER(ch)) {
8983 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8984 if (lo > maxchar)
8985 maxchar = lo;
8986 PyUnicode_WRITE(kind, data, i, lo);
8987 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 else if (ch > maxchar)
8990 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992
8993 if (touched)
8994 return maxchar;
8995 else
8996 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997}
8998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009000fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9003 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9004 const int kind = PyUnicode_KIND(self);
9005 void *data = PyUnicode_DATA(self);
9006 Py_UCS4 maxchar = 0;
9007 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 int previous_is_cased;
9009
9010 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (len == 1) {
9012 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9013 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9014 if (ti != ch) {
9015 PyUnicode_WRITE(kind, data, i, ti);
9016 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 }
9018 else
9019 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 for(; i < len; ++i) {
9023 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9024 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009025
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 nu = Py_UNICODE_TOTITLE(ch);
9030
9031 if (nu > maxchar)
9032 maxchar = nu;
9033 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009034
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 if (Py_UNICODE_ISLOWER(ch) ||
9036 Py_UNICODE_ISUPPER(ch) ||
9037 Py_UNICODE_ISTITLE(ch))
9038 previous_is_cased = 1;
9039 else
9040 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043}
9044
Tim Peters8ce9f162004-08-27 01:49:32 +00009045PyObject *
9046PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009051 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009052 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9053 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009054 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009056 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058
Tim Peters05eba1f2004-08-27 21:32:02 +00009059 fseq = PySequence_Fast(seq, "");
9060 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009061 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009062 }
9063
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009064 /* NOTE: the following code can't call back into Python code,
9065 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009066 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009067
Tim Peters05eba1f2004-08-27 21:32:02 +00009068 seqlen = PySequence_Fast_GET_SIZE(fseq);
9069 /* If empty sequence, return u"". */
9070 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009071 Py_DECREF(fseq);
9072 Py_INCREF(unicode_empty);
9073 res = unicode_empty;
9074 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009075 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009076
Tim Peters05eba1f2004-08-27 21:32:02 +00009077 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009078 items = PySequence_Fast_ITEMS(fseq);
9079 if (seqlen == 1 && PyUnicode_CheckExact(items[0])) {
9080 res = items[0];
9081 Py_INCREF(res);
9082 Py_DECREF(fseq);
9083 return res;
9084 }
9085
9086 /* Set up sep and seplen */
9087 if (separator == NULL) {
9088 /* fall back to a blank space separator */
9089 sep = PyUnicode_FromOrdinal(' ');
9090 if (!sep)
9091 goto onError;
9092 maxchar = 32;
Tim Peters8ce9f162004-08-27 01:49:32 +00009093 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009094 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009095 if (!PyUnicode_Check(separator)) {
9096 PyErr_Format(PyExc_TypeError,
9097 "separator: expected str instance,"
9098 " %.80s found",
9099 Py_TYPE(separator)->tp_name);
9100 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009101 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009102 if (PyUnicode_READY(separator))
9103 goto onError;
9104 sep = separator;
9105 seplen = PyUnicode_GET_LENGTH(separator);
9106 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9107 /* inc refcount to keep this code path symmetric with the
9108 above case of a blank separator */
9109 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009110 }
9111
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009112 /* There are at least two things to join, or else we have a subclass
9113 * of str in the sequence.
9114 * Do a pre-pass to figure out the total amount of space we'll
9115 * need (sz), and see whether all argument are strings.
9116 */
9117 sz = 0;
9118 for (i = 0; i < seqlen; i++) {
9119 const Py_ssize_t old_sz = sz;
9120 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 if (!PyUnicode_Check(item)) {
9122 PyErr_Format(PyExc_TypeError,
9123 "sequence item %zd: expected str instance,"
9124 " %.80s found",
9125 i, Py_TYPE(item)->tp_name);
9126 goto onError;
9127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 if (PyUnicode_READY(item) == -1)
9129 goto onError;
9130 sz += PyUnicode_GET_LENGTH(item);
9131 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9132 if (item_maxchar > maxchar)
9133 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009134 if (i != 0)
9135 sz += seplen;
9136 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9137 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009139 goto onError;
9140 }
9141 }
Tim Petersced69f82003-09-16 20:30:58 +00009142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009144 if (res == NULL)
9145 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009146
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009147 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009149 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009150 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009152 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009153 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009156 itemlen = PyUnicode_GET_LENGTH(item);
9157 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009158 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009159 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009160 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009163
Tim Peters05eba1f2004-08-27 21:32:02 +00009164 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009166 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009170 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009172 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 return NULL;
9174}
9175
/* FILL(kind, data, value, start, length): store `length` copies of
   `value` into the character buffer `data`, beginning at character index
   `start`.  `kind` selects the element width of the buffer; it must not
   be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  `value` and `length` may be
   evaluated more than once, so they should be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9198
Victor Stinner9310abb2011-10-05 00:59:23 +02009199static PyObject *
9200pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009201 Py_ssize_t left,
9202 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 PyObject *u;
9206 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009207 int kind;
9208 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209
9210 if (left < 0)
9211 left = 0;
9212 if (right < 0)
9213 right = 0;
9214
Tim Peters7a29bd52001-09-12 03:03:31 +00009215 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 Py_INCREF(self);
9217 return self;
9218 }
9219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9221 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009222 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9223 return NULL;
9224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9226 if (fill > maxchar)
9227 maxchar = fill;
9228 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009229 if (!u)
9230 return NULL;
9231
9232 kind = PyUnicode_KIND(u);
9233 data = PyUnicode_DATA(u);
9234 if (left)
9235 FILL(kind, data, fill, 0, left);
9236 if (right)
9237 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009238 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009239 assert(_PyUnicode_CheckConsistency(u, 1));
9240 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243
Alexander Belopolsky40018472011-02-26 01:02:56 +00009244PyObject *
9245PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248
9249 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 switch(PyUnicode_KIND(string)) {
9254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009255 if (PyUnicode_IS_ASCII(string))
9256 list = asciilib_splitlines(
9257 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9258 PyUnicode_GET_LENGTH(string), keepends);
9259 else
9260 list = ucs1lib_splitlines(
9261 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9262 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 break;
9264 case PyUnicode_2BYTE_KIND:
9265 list = ucs2lib_splitlines(
9266 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9267 PyUnicode_GET_LENGTH(string), keepends);
9268 break;
9269 case PyUnicode_4BYTE_KIND:
9270 list = ucs4lib_splitlines(
9271 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9272 PyUnicode_GET_LENGTH(string), keepends);
9273 break;
9274 default:
9275 assert(0);
9276 list = 0;
9277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278 Py_DECREF(string);
9279 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280}
9281
Alexander Belopolsky40018472011-02-26 01:02:56 +00009282static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009283split(PyObject *self,
9284 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 int kind1, kind2, kind;
9288 void *buf1, *buf2;
9289 Py_ssize_t len1, len2;
9290 PyObject* out;
9291
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009293 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 if (PyUnicode_READY(self) == -1)
9296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (substring == NULL)
9299 switch(PyUnicode_KIND(self)) {
9300 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009301 if (PyUnicode_IS_ASCII(self))
9302 return asciilib_split_whitespace(
9303 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9304 PyUnicode_GET_LENGTH(self), maxcount
9305 );
9306 else
9307 return ucs1lib_split_whitespace(
9308 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9309 PyUnicode_GET_LENGTH(self), maxcount
9310 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 case PyUnicode_2BYTE_KIND:
9312 return ucs2lib_split_whitespace(
9313 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9314 PyUnicode_GET_LENGTH(self), maxcount
9315 );
9316 case PyUnicode_4BYTE_KIND:
9317 return ucs4lib_split_whitespace(
9318 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9319 PyUnicode_GET_LENGTH(self), maxcount
9320 );
9321 default:
9322 assert(0);
9323 return NULL;
9324 }
9325
9326 if (PyUnicode_READY(substring) == -1)
9327 return NULL;
9328
9329 kind1 = PyUnicode_KIND(self);
9330 kind2 = PyUnicode_KIND(substring);
9331 kind = kind1 > kind2 ? kind1 : kind2;
9332 buf1 = PyUnicode_DATA(self);
9333 buf2 = PyUnicode_DATA(substring);
9334 if (kind1 != kind)
9335 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9336 if (!buf1)
9337 return NULL;
9338 if (kind2 != kind)
9339 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9340 if (!buf2) {
9341 if (kind1 != kind) PyMem_Free(buf1);
9342 return NULL;
9343 }
9344 len1 = PyUnicode_GET_LENGTH(self);
9345 len2 = PyUnicode_GET_LENGTH(substring);
9346
9347 switch(kind) {
9348 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009349 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9350 out = asciilib_split(
9351 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9352 else
9353 out = ucs1lib_split(
9354 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 break;
9356 case PyUnicode_2BYTE_KIND:
9357 out = ucs2lib_split(
9358 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9359 break;
9360 case PyUnicode_4BYTE_KIND:
9361 out = ucs4lib_split(
9362 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9363 break;
9364 default:
9365 out = NULL;
9366 }
9367 if (kind1 != kind)
9368 PyMem_Free(buf1);
9369 if (kind2 != kind)
9370 PyMem_Free(buf2);
9371 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372}
9373
Alexander Belopolsky40018472011-02-26 01:02:56 +00009374static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009375rsplit(PyObject *self,
9376 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009377 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 int kind1, kind2, kind;
9380 void *buf1, *buf2;
9381 Py_ssize_t len1, len2;
9382 PyObject* out;
9383
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009384 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009385 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 if (PyUnicode_READY(self) == -1)
9388 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (substring == NULL)
9391 switch(PyUnicode_KIND(self)) {
9392 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009393 if (PyUnicode_IS_ASCII(self))
9394 return asciilib_rsplit_whitespace(
9395 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9396 PyUnicode_GET_LENGTH(self), maxcount
9397 );
9398 else
9399 return ucs1lib_rsplit_whitespace(
9400 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9401 PyUnicode_GET_LENGTH(self), maxcount
9402 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 case PyUnicode_2BYTE_KIND:
9404 return ucs2lib_rsplit_whitespace(
9405 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9406 PyUnicode_GET_LENGTH(self), maxcount
9407 );
9408 case PyUnicode_4BYTE_KIND:
9409 return ucs4lib_rsplit_whitespace(
9410 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9411 PyUnicode_GET_LENGTH(self), maxcount
9412 );
9413 default:
9414 assert(0);
9415 return NULL;
9416 }
9417
9418 if (PyUnicode_READY(substring) == -1)
9419 return NULL;
9420
9421 kind1 = PyUnicode_KIND(self);
9422 kind2 = PyUnicode_KIND(substring);
9423 kind = kind1 > kind2 ? kind1 : kind2;
9424 buf1 = PyUnicode_DATA(self);
9425 buf2 = PyUnicode_DATA(substring);
9426 if (kind1 != kind)
9427 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9428 if (!buf1)
9429 return NULL;
9430 if (kind2 != kind)
9431 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9432 if (!buf2) {
9433 if (kind1 != kind) PyMem_Free(buf1);
9434 return NULL;
9435 }
9436 len1 = PyUnicode_GET_LENGTH(self);
9437 len2 = PyUnicode_GET_LENGTH(substring);
9438
9439 switch(kind) {
9440 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009441 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9442 out = asciilib_rsplit(
9443 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9444 else
9445 out = ucs1lib_rsplit(
9446 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 out = ucs2lib_rsplit(
9450 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9451 break;
9452 case PyUnicode_4BYTE_KIND:
9453 out = ucs4lib_rsplit(
9454 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9455 break;
9456 default:
9457 out = NULL;
9458 }
9459 if (kind1 != kind)
9460 PyMem_Free(buf1);
9461 if (kind2 != kind)
9462 PyMem_Free(buf2);
9463 return out;
9464}
9465
9466static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009467anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9468 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469{
9470 switch(kind) {
9471 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009472 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9473 return asciilib_find(buf1, len1, buf2, len2, offset);
9474 else
9475 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 case PyUnicode_2BYTE_KIND:
9477 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9478 case PyUnicode_4BYTE_KIND:
9479 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9480 }
9481 assert(0);
9482 return -1;
9483}
9484
9485static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009486anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9487 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488{
9489 switch(kind) {
9490 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009491 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9492 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9493 else
9494 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 case PyUnicode_2BYTE_KIND:
9496 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9497 case PyUnicode_4BYTE_KIND:
9498 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9499 }
9500 assert(0);
9501 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009502}
9503
Alexander Belopolsky40018472011-02-26 01:02:56 +00009504static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505replace(PyObject *self, PyObject *str1,
9506 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 PyObject *u;
9509 char *sbuf = PyUnicode_DATA(self);
9510 char *buf1 = PyUnicode_DATA(str1);
9511 char *buf2 = PyUnicode_DATA(str2);
9512 int srelease = 0, release1 = 0, release2 = 0;
9513 int skind = PyUnicode_KIND(self);
9514 int kind1 = PyUnicode_KIND(str1);
9515 int kind2 = PyUnicode_KIND(str2);
9516 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9517 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9518 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519
9520 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009523 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 if (skind < kind1)
9526 /* substring too wide to be present */
9527 goto nothing;
9528
9529 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009530 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009533 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009535 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 Py_UCS4 u1, u2, maxchar;
9537 int mayshrink, rkind;
9538 u1 = PyUnicode_READ_CHAR(str1, 0);
9539 if (!findchar(sbuf, PyUnicode_KIND(self),
9540 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009541 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 u2 = PyUnicode_READ_CHAR(str2, 0);
9543 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9544 /* Replacing u1 with u2 may cause a maxchar reduction in the
9545 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 if (u2 > maxchar) {
9547 maxchar = u2;
9548 mayshrink = 0;
9549 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009550 else
9551 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009553 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009555 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 rkind = PyUnicode_KIND(u);
9557 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9558 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009559 if (--maxcount < 0)
9560 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 if (mayshrink) {
9564 PyObject *tmp = u;
9565 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9566 PyUnicode_GET_LENGTH(tmp));
9567 Py_DECREF(tmp);
9568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 int rkind = skind;
9571 char *res;
9572 if (kind1 < rkind) {
9573 /* widen substring */
9574 buf1 = _PyUnicode_AsKind(str1, rkind);
9575 if (!buf1) goto error;
9576 release1 = 1;
9577 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009578 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009579 if (i < 0)
9580 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 if (rkind > kind2) {
9582 /* widen replacement */
9583 buf2 = _PyUnicode_AsKind(str2, rkind);
9584 if (!buf2) goto error;
9585 release2 = 1;
9586 }
9587 else if (rkind < kind2) {
9588 /* widen self and buf1 */
9589 rkind = kind2;
9590 if (release1) PyMem_Free(buf1);
9591 sbuf = _PyUnicode_AsKind(self, rkind);
9592 if (!sbuf) goto error;
9593 srelease = 1;
9594 buf1 = _PyUnicode_AsKind(str1, rkind);
9595 if (!buf1) goto error;
9596 release1 = 1;
9597 }
9598 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9599 if (!res) {
9600 PyErr_NoMemory();
9601 goto error;
9602 }
9603 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009604 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9606 buf2,
9607 PyUnicode_KIND_SIZE(rkind, len2));
9608 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009609
9610 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009611 i = anylib_find(rkind, self,
9612 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9613 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009614 if (i == -1)
9615 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9617 buf2,
9618 PyUnicode_KIND_SIZE(rkind, len2));
9619 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621
9622 u = PyUnicode_FromKindAndData(rkind, res, slen);
9623 PyMem_Free(res);
9624 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 Py_ssize_t n, i, j, ires;
9629 Py_ssize_t product, new_size;
9630 int rkind = skind;
9631 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 if (kind1 < rkind) {
9634 buf1 = _PyUnicode_AsKind(str1, rkind);
9635 if (!buf1) goto error;
9636 release1 = 1;
9637 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009638 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639 if (n == 0)
9640 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 if (kind2 < rkind) {
9642 buf2 = _PyUnicode_AsKind(str2, rkind);
9643 if (!buf2) goto error;
9644 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 else if (kind2 > rkind) {
9647 rkind = kind2;
9648 sbuf = _PyUnicode_AsKind(self, rkind);
9649 if (!sbuf) goto error;
9650 srelease = 1;
9651 if (release1) PyMem_Free(buf1);
9652 buf1 = _PyUnicode_AsKind(str1, rkind);
9653 if (!buf1) goto error;
9654 release1 = 1;
9655 }
9656 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9657 PyUnicode_GET_LENGTH(str1))); */
9658 product = n * (len2-len1);
9659 if ((product / (len2-len1)) != n) {
9660 PyErr_SetString(PyExc_OverflowError,
9661 "replace string is too long");
9662 goto error;
9663 }
9664 new_size = slen + product;
9665 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9666 PyErr_SetString(PyExc_OverflowError,
9667 "replace string is too long");
9668 goto error;
9669 }
9670 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9671 if (!res)
9672 goto error;
9673 ires = i = 0;
9674 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009675 while (n-- > 0) {
9676 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009677 j = anylib_find(rkind, self,
9678 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9679 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009680 if (j == -1)
9681 break;
9682 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009683 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9685 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9686 PyUnicode_KIND_SIZE(rkind, j-i));
9687 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009688 }
9689 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 if (len2 > 0) {
9691 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9692 buf2,
9693 PyUnicode_KIND_SIZE(rkind, len2));
9694 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009699 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9701 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9702 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009703 } else {
9704 /* interleave */
9705 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9707 buf2,
9708 PyUnicode_KIND_SIZE(rkind, len2));
9709 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009710 if (--n <= 0)
9711 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9713 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9714 PyUnicode_KIND_SIZE(rkind, 1));
9715 ires++;
9716 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9719 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9720 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009721 }
Victor Stinnerf48323e2011-10-05 23:27:08 +02009722 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
9723 u = unicode_fromascii((unsigned char*)res, new_size);
9724 else
9725 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009726 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 if (srelease)
9729 PyMem_FREE(sbuf);
9730 if (release1)
9731 PyMem_FREE(buf1);
9732 if (release2)
9733 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009734 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009736
Benjamin Peterson29060642009-01-31 22:14:21 +00009737 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009738 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (srelease)
9740 PyMem_FREE(sbuf);
9741 if (release1)
9742 PyMem_FREE(buf1);
9743 if (release2)
9744 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009745 if (PyUnicode_CheckExact(self)) {
9746 Py_INCREF(self);
9747 return (PyObject *) self;
9748 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009749 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 error:
9751 if (srelease && sbuf)
9752 PyMem_FREE(sbuf);
9753 if (release1 && buf1)
9754 PyMem_FREE(buf1);
9755 if (release2 && buf2)
9756 PyMem_FREE(buf2);
9757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758}
9759
9760/* --- Unicode Object Methods --------------------------------------------- */
9761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009762PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764\n\
9765Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009766characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767
9768static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009769unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771 return fixup(self, fixtitle);
9772}
9773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009774PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776\n\
9777Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009778have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
9780static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009781unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783 return fixup(self, fixcapitalize);
9784}
9785
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: this whole section is compiled out by the enclosing "#if 0".

   Splits self into words, replaces every list entry by its fixup() result
   (fixcapitalize callback) and joins the entries again.  Returns the joined
   string, or NULL if splitting, fixing up or joining failed. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        /* drop the old entry before storing the capitalized one */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9823
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009824/* Argument converter. Coerces to a single unicode character */
9825
9826static int
9827convert_uc(PyObject *obj, void *addr)
9828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009831
Benjamin Peterson14339b62009-01-31 16:36:08 +00009832 uniobj = PyUnicode_FromObject(obj);
9833 if (uniobj == NULL) {
9834 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009835 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009836 return 0;
9837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009839 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009841 Py_DECREF(uniobj);
9842 return 0;
9843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009845 Py_DECREF(uniobj);
9846 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009847}
9848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009849PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009850 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009852Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009853done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854
9855static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009856unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009858 Py_ssize_t marg, left;
9859 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 Py_UCS4 fillchar = ' ';
9861
Victor Stinnere9a29352011-10-01 02:14:59 +02009862 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
Victor Stinnere9a29352011-10-01 02:14:59 +02009865 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 return NULL;
9867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869 Py_INCREF(self);
9870 return (PyObject*) self;
9871 }
9872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874 left = marg / 2 + (marg & width & 1);
9875
Victor Stinner9310abb2011-10-05 00:59:23 +02009876 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877}
9878
Marc-André Lemburge5034372000-08-08 08:04:29 +00009879#if 0
9880
9881/* This code should go into some future Unicode collation support
9882 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009883 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009884
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009885/* speedy UTF-16 code point order comparison */
9886/* gleaned from: */
9887/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9888
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009889static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009890{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009891 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009892 0, 0, 0, 0, 0, 0, 0, 0,
9893 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009894 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009895};
9896
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897static int
9898unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009900 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009901
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 Py_UNICODE *s1 = str1->str;
9903 Py_UNICODE *s2 = str2->str;
9904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 len1 = str1->_base._base.length;
9906 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009907
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009909 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009910
9911 c1 = *s1++;
9912 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009913
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 if (c1 > (1<<11) * 26)
9915 c1 += utf16Fixup[c1>>11];
9916 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009917 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009918 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009919
9920 if (c1 != c2)
9921 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009922
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009923 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 }
9925
9926 return (len1 < len2) ? -1 : (len1 != len2);
9927}
9928
Marc-André Lemburge5034372000-08-08 08:04:29 +00009929#else
9930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931/* This function assumes that str1 and str2 are readied by the caller. */
9932
Marc-André Lemburge5034372000-08-08 08:04:29 +00009933static int
9934unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 int kind1, kind2;
9937 void *data1, *data2;
9938 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 kind1 = PyUnicode_KIND(str1);
9941 kind2 = PyUnicode_KIND(str2);
9942 data1 = PyUnicode_DATA(str1);
9943 data2 = PyUnicode_DATA(str2);
9944 len1 = PyUnicode_GET_LENGTH(str1);
9945 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 for (i = 0; i < len1 && i < len2; ++i) {
9948 Py_UCS4 c1, c2;
9949 c1 = PyUnicode_READ(kind1, data1, i);
9950 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009951
9952 if (c1 != c2)
9953 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009954 }
9955
9956 return (len1 < len2) ? -1 : (len1 != len2);
9957}
9958
9959#endif
9960
Alexander Belopolsky40018472011-02-26 01:02:56 +00009961int
9962PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9965 if (PyUnicode_READY(left) == -1 ||
9966 PyUnicode_READY(right) == -1)
9967 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009968 return unicode_compare((PyUnicodeObject *)left,
9969 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009971 PyErr_Format(PyExc_TypeError,
9972 "Can't compare %.100s and %.100s",
9973 left->ob_type->tp_name,
9974 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975 return -1;
9976}
9977
Martin v. Löwis5b222132007-06-10 09:51:05 +00009978int
9979PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 Py_ssize_t i;
9982 int kind;
9983 void *data;
9984 Py_UCS4 chr;
9985
Victor Stinner910337b2011-10-03 03:20:16 +02009986 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 if (PyUnicode_READY(uni) == -1)
9988 return -1;
9989 kind = PyUnicode_KIND(uni);
9990 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009991 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9993 if (chr != str[i])
9994 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009995 /* This check keeps Python strings that end in '\0' from comparing equal
9996 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009999 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010001 return 0;
10002}
10003
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010004
Benjamin Peterson29060642009-01-31 22:14:21 +000010005#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010006 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010007
Alexander Belopolsky40018472011-02-26 01:02:56 +000010008PyObject *
10009PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010010{
10011 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010013 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10014 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 if (PyUnicode_READY(left) == -1 ||
10016 PyUnicode_READY(right) == -1)
10017 return NULL;
10018 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10019 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010020 if (op == Py_EQ) {
10021 Py_INCREF(Py_False);
10022 return Py_False;
10023 }
10024 if (op == Py_NE) {
10025 Py_INCREF(Py_True);
10026 return Py_True;
10027 }
10028 }
10029 if (left == right)
10030 result = 0;
10031 else
10032 result = unicode_compare((PyUnicodeObject *)left,
10033 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010034
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010035 /* Convert the return value to a Boolean */
10036 switch (op) {
10037 case Py_EQ:
10038 v = TEST_COND(result == 0);
10039 break;
10040 case Py_NE:
10041 v = TEST_COND(result != 0);
10042 break;
10043 case Py_LE:
10044 v = TEST_COND(result <= 0);
10045 break;
10046 case Py_GE:
10047 v = TEST_COND(result >= 0);
10048 break;
10049 case Py_LT:
10050 v = TEST_COND(result == -1);
10051 break;
10052 case Py_GT:
10053 v = TEST_COND(result == 1);
10054 break;
10055 default:
10056 PyErr_BadArgument();
10057 return NULL;
10058 }
10059 Py_INCREF(v);
10060 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010061 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010062
Brian Curtindfc80e32011-08-10 20:28:54 -050010063 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010064}
10065
Alexander Belopolsky40018472011-02-26 01:02:56 +000010066int
10067PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010068{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010069 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 int kind1, kind2, kind;
10071 void *buf1, *buf2;
10072 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010073 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010074
10075 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 sub = PyUnicode_FromObject(element);
10077 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 PyErr_Format(PyExc_TypeError,
10079 "'in <string>' requires string as left operand, not %s",
10080 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010081 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (PyUnicode_READY(sub) == -1)
10084 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010085
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010087 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 Py_DECREF(sub);
10089 return -1;
10090 }
10091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 kind1 = PyUnicode_KIND(str);
10093 kind2 = PyUnicode_KIND(sub);
10094 kind = kind1 > kind2 ? kind1 : kind2;
10095 buf1 = PyUnicode_DATA(str);
10096 buf2 = PyUnicode_DATA(sub);
10097 if (kind1 != kind)
10098 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10099 if (!buf1) {
10100 Py_DECREF(sub);
10101 return -1;
10102 }
10103 if (kind2 != kind)
10104 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10105 if (!buf2) {
10106 Py_DECREF(sub);
10107 if (kind1 != kind) PyMem_Free(buf1);
10108 return -1;
10109 }
10110 len1 = PyUnicode_GET_LENGTH(str);
10111 len2 = PyUnicode_GET_LENGTH(sub);
10112
10113 switch(kind) {
10114 case PyUnicode_1BYTE_KIND:
10115 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10116 break;
10117 case PyUnicode_2BYTE_KIND:
10118 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10119 break;
10120 case PyUnicode_4BYTE_KIND:
10121 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10122 break;
10123 default:
10124 result = -1;
10125 assert(0);
10126 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010127
10128 Py_DECREF(str);
10129 Py_DECREF(sub);
10130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (kind1 != kind)
10132 PyMem_Free(buf1);
10133 if (kind2 != kind)
10134 PyMem_Free(buf2);
10135
Guido van Rossum403d68b2000-03-13 15:55:09 +000010136 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010137}
10138
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139/* Concat to string or Unicode object giving a new Unicode object. */
10140
Alexander Belopolsky40018472011-02-26 01:02:56 +000010141PyObject *
10142PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 PyObject *u = NULL, *v = NULL, *w;
10145 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
10147 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
10155 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010156 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010160 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010161 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163 }
10164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010166 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 w = PyUnicode_New(
10170 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10171 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010174 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10175 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 Py_DECREF(u);
10177 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010178 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 Py_XDECREF(u);
10183 Py_XDECREF(v);
10184 return NULL;
10185}
10186
Victor Stinnerb0923652011-10-04 01:17:31 +020010187static void
10188unicode_append_inplace(PyObject **p_left, PyObject *right)
10189{
10190 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010191
10192 assert(PyUnicode_IS_READY(*p_left));
10193 assert(PyUnicode_IS_READY(right));
10194
10195 left_len = PyUnicode_GET_LENGTH(*p_left);
10196 right_len = PyUnicode_GET_LENGTH(right);
10197 if (left_len > PY_SSIZE_T_MAX - right_len) {
10198 PyErr_SetString(PyExc_OverflowError,
10199 "strings are too large to concat");
10200 goto error;
10201 }
10202 new_len = left_len + right_len;
10203
10204 /* Now we own the last reference to 'left', so we can resize it
10205 * in-place.
10206 */
10207 if (unicode_resize(p_left, new_len) != 0) {
10208 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10209 * deallocated so it cannot be put back into
10210 * 'variable'. The MemoryError is raised when there
10211 * is no value in 'variable', which might (very
10212 * remotely) be a cause of incompatibilities.
10213 */
10214 goto error;
10215 }
10216 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010217 copy_characters(*p_left, left_len, right, 0, right_len);
10218 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010219 return;
10220
10221error:
10222 Py_DECREF(*p_left);
10223 *p_left = NULL;
10224}
10225
Walter Dörwald1ab83302007-05-18 17:15:44 +000010226void
Victor Stinner23e56682011-10-03 03:54:37 +020010227PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010228{
Victor Stinner23e56682011-10-03 03:54:37 +020010229 PyObject *left, *res;
10230
10231 if (p_left == NULL) {
10232 if (!PyErr_Occurred())
10233 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 return;
10235 }
Victor Stinner23e56682011-10-03 03:54:37 +020010236 left = *p_left;
10237 if (right == NULL || !PyUnicode_Check(left)) {
10238 if (!PyErr_Occurred())
10239 PyErr_BadInternalCall();
10240 goto error;
10241 }
10242
Victor Stinnere1335c72011-10-04 20:53:03 +020010243 if (PyUnicode_READY(left))
10244 goto error;
10245 if (PyUnicode_READY(right))
10246 goto error;
10247
Victor Stinner23e56682011-10-03 03:54:37 +020010248 if (PyUnicode_CheckExact(left) && left != unicode_empty
10249 && PyUnicode_CheckExact(right) && right != unicode_empty
10250 && unicode_resizable(left)
10251 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10252 || _PyUnicode_WSTR(left) != NULL))
10253 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010254 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10255 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010256 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010257 not so different than duplicating the string. */
10258 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010259 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010260 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010261 if (p_left != NULL)
10262 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010263 return;
10264 }
10265 }
10266
10267 res = PyUnicode_Concat(left, right);
10268 if (res == NULL)
10269 goto error;
10270 Py_DECREF(left);
10271 *p_left = res;
10272 return;
10273
10274error:
10275 Py_DECREF(*p_left);
10276 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010277}
10278
10279void
10280PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10281{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 PyUnicode_Append(pleft, right);
10283 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010284}
10285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010286PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010287 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010290string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010291interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
10293static PyObject *
10294unicode_count(PyUnicodeObject *self, PyObject *args)
10295{
10296 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010297 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010298 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 int kind1, kind2, kind;
10301 void *buf1, *buf2;
10302 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
Jesus Ceaac451502011-04-20 17:09:23 +020010304 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10305 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 kind1 = PyUnicode_KIND(self);
10309 kind2 = PyUnicode_KIND(substring);
10310 kind = kind1 > kind2 ? kind1 : kind2;
10311 buf1 = PyUnicode_DATA(self);
10312 buf2 = PyUnicode_DATA(substring);
10313 if (kind1 != kind)
10314 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10315 if (!buf1) {
10316 Py_DECREF(substring);
10317 return NULL;
10318 }
10319 if (kind2 != kind)
10320 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10321 if (!buf2) {
10322 Py_DECREF(substring);
10323 if (kind1 != kind) PyMem_Free(buf1);
10324 return NULL;
10325 }
10326 len1 = PyUnicode_GET_LENGTH(self);
10327 len2 = PyUnicode_GET_LENGTH(substring);
10328
10329 ADJUST_INDICES(start, end, len1);
10330 switch(kind) {
10331 case PyUnicode_1BYTE_KIND:
10332 iresult = ucs1lib_count(
10333 ((Py_UCS1*)buf1) + start, end - start,
10334 buf2, len2, PY_SSIZE_T_MAX
10335 );
10336 break;
10337 case PyUnicode_2BYTE_KIND:
10338 iresult = ucs2lib_count(
10339 ((Py_UCS2*)buf1) + start, end - start,
10340 buf2, len2, PY_SSIZE_T_MAX
10341 );
10342 break;
10343 case PyUnicode_4BYTE_KIND:
10344 iresult = ucs4lib_count(
10345 ((Py_UCS4*)buf1) + start, end - start,
10346 buf2, len2, PY_SSIZE_T_MAX
10347 );
10348 break;
10349 default:
10350 assert(0); iresult = 0;
10351 }
10352
10353 result = PyLong_FromSsize_t(iresult);
10354
10355 if (kind1 != kind)
10356 PyMem_Free(buf1);
10357 if (kind2 != kind)
10358 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010361
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 return result;
10363}
10364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010365PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010366 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010368Encode S using the codec registered for encoding. Default encoding\n\
10369is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010370handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010371a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10372'xmlcharrefreplace' as well as any other name registered with\n\
10373codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
10375static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010376unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010378 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 char *encoding = NULL;
10380 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010381
Benjamin Peterson308d6372009-09-18 21:42:35 +000010382 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10383 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010385 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010386}
10387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010388PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010389 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390\n\
10391Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393
10394static PyObject*
10395unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10396{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010397 Py_ssize_t i, j, line_pos, src_len, incr;
10398 Py_UCS4 ch;
10399 PyObject *u;
10400 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010402 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010403 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404
10405 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
Antoine Pitrou22425222011-10-04 19:10:51 +020010408 if (PyUnicode_READY(self) == -1)
10409 return NULL;
10410
Thomas Wouters7e474022000-07-16 12:04:32 +000010411 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010412 src_len = PyUnicode_GET_LENGTH(self);
10413 i = j = line_pos = 0;
10414 kind = PyUnicode_KIND(self);
10415 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010416 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010417 for (; i < src_len; i++) {
10418 ch = PyUnicode_READ(kind, src_data, i);
10419 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010420 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010422 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010423 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010424 goto overflow;
10425 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010427 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010431 goto overflow;
10432 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010434 if (ch == '\n' || ch == '\r')
10435 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010437 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010438 if (!found && PyUnicode_CheckExact(self)) {
10439 Py_INCREF((PyObject *) self);
10440 return (PyObject *) self;
10441 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010442
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010444 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 if (!u)
10446 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010447 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Antoine Pitroue71d5742011-10-04 15:55:09 +020010449 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450
Antoine Pitroue71d5742011-10-04 15:55:09 +020010451 for (; i < src_len; i++) {
10452 ch = PyUnicode_READ(kind, src_data, i);
10453 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010455 incr = tabsize - (line_pos % tabsize);
10456 line_pos += incr;
10457 while (incr--) {
10458 PyUnicode_WRITE(kind, dest_data, j, ' ');
10459 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010460 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010464 line_pos++;
10465 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010466 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010467 if (ch == '\n' || ch == '\r')
10468 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010470 }
10471 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010472#ifndef DONT_MAKE_RESULT_READY
10473 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 Py_DECREF(u);
10475 return NULL;
10476 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010477#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010478 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010480
Antoine Pitroue71d5742011-10-04 15:55:09 +020010481 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010482 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484}
10485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010486PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010487 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488\n\
10489Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010490such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491arguments start and end are interpreted as in slice notation.\n\
10492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010493Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
10495static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497{
Jesus Ceaac451502011-04-20 17:09:23 +020010498 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010499 Py_ssize_t start;
10500 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Jesus Ceaac451502011-04-20 17:09:23 +020010503 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10504 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (PyUnicode_READY(self) == -1)
10508 return NULL;
10509 if (PyUnicode_READY(substring) == -1)
10510 return NULL;
10511
10512 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010513 asciilib_find_slice, ucs1lib_find_slice,
10514 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517
10518 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (result == -2)
10521 return NULL;
10522
Christian Heimes217cfd12007-12-02 14:31:20 +000010523 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524}
10525
10526static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010527unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010529 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10530 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533}
10534
Guido van Rossumc2504932007-09-18 19:42:40 +000010535/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010536 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010537static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010538unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Guido van Rossumc2504932007-09-18 19:42:40 +000010540 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010541 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (_PyUnicode_HASH(self) != -1)
10544 return _PyUnicode_HASH(self);
10545 if (PyUnicode_READY(self) == -1)
10546 return -1;
10547 len = PyUnicode_GET_LENGTH(self);
10548
10549 /* The hash function as a macro, gets expanded three times below. */
10550#define HASH(P) \
10551 x = (Py_uhash_t)*P << 7; \
10552 while (--len >= 0) \
10553 x = (1000003*x) ^ (Py_uhash_t)*P++;
10554
10555 switch (PyUnicode_KIND(self)) {
10556 case PyUnicode_1BYTE_KIND: {
10557 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10558 HASH(c);
10559 break;
10560 }
10561 case PyUnicode_2BYTE_KIND: {
10562 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10563 HASH(s);
10564 break;
10565 }
10566 default: {
10567 Py_UCS4 *l;
10568 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10569 "Impossible switch case in unicode_hash");
10570 l = PyUnicode_4BYTE_DATA(self);
10571 HASH(l);
10572 break;
10573 }
10574 }
10575 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10576
Guido van Rossumc2504932007-09-18 19:42:40 +000010577 if (x == -1)
10578 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010580 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010587Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
10589static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010592 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010593 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010594 Py_ssize_t start;
10595 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Jesus Ceaac451502011-04-20 17:09:23 +020010597 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10598 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (PyUnicode_READY(self) == -1)
10602 return NULL;
10603 if (PyUnicode_READY(substring) == -1)
10604 return NULL;
10605
10606 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 asciilib_find_slice, ucs1lib_find_slice,
10608 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (result == -2)
10615 return NULL;
10616
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (result < 0) {
10618 PyErr_SetString(PyExc_ValueError, "substring not found");
10619 return NULL;
10620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621
Christian Heimes217cfd12007-12-02 14:31:20 +000010622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623}
10624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010625PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010628Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630
10631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010632unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 Py_ssize_t i, length;
10635 int kind;
10636 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 int cased;
10638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (PyUnicode_READY(self) == -1)
10640 return NULL;
10641 length = PyUnicode_GET_LENGTH(self);
10642 kind = PyUnicode_KIND(self);
10643 data = PyUnicode_DATA(self);
10644
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (length == 1)
10647 return PyBool_FromLong(
10648 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010650 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010653
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 for (i = 0; i < length; i++) {
10656 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010657
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10659 return PyBool_FromLong(0);
10660 else if (!cased && Py_UNICODE_ISLOWER(ch))
10661 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664}
10665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010666PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010669Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010670at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
10672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010673unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 Py_ssize_t i, length;
10676 int kind;
10677 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 int cased;
10679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (PyUnicode_READY(self) == -1)
10681 return NULL;
10682 length = PyUnicode_GET_LENGTH(self);
10683 kind = PyUnicode_KIND(self);
10684 data = PyUnicode_DATA(self);
10685
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (length == 1)
10688 return PyBool_FromLong(
10689 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010691 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010694
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 for (i = 0; i < length; i++) {
10697 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010698
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10700 return PyBool_FromLong(0);
10701 else if (!cased && Py_UNICODE_ISUPPER(ch))
10702 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010704 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010710Return True if S is a titlecased string and there is at least one\n\
10711character in S, i.e. upper- and titlecase characters may only\n\
10712follow uncased characters and lowercase characters only cased ones.\n\
10713Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010716unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 Py_ssize_t i, length;
10719 int kind;
10720 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 int cased, previous_is_cased;
10722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (PyUnicode_READY(self) == -1)
10724 return NULL;
10725 length = PyUnicode_GET_LENGTH(self);
10726 kind = PyUnicode_KIND(self);
10727 data = PyUnicode_DATA(self);
10728
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (length == 1) {
10731 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10732 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10733 (Py_UNICODE_ISUPPER(ch) != 0));
10734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010739
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 cased = 0;
10741 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 for (i = 0; i < length; i++) {
10743 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010744
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10746 if (previous_is_cased)
10747 return PyBool_FromLong(0);
10748 previous_is_cased = 1;
10749 cased = 1;
10750 }
10751 else if (Py_UNICODE_ISLOWER(ch)) {
10752 if (!previous_is_cased)
10753 return PyBool_FromLong(0);
10754 previous_is_cased = 1;
10755 cased = 1;
10756 }
10757 else
10758 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010760 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761}
10762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010763PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010766Return True if all characters in S are whitespace\n\
10767and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
10769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010770unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 Py_ssize_t i, length;
10773 int kind;
10774 void *data;
10775
10776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 length = PyUnicode_GET_LENGTH(self);
10779 kind = PyUnicode_KIND(self);
10780 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (length == 1)
10784 return PyBool_FromLong(
10785 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010787 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 for (i = 0; i < length; i++) {
10792 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010793 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010796 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797}
10798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010799PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010802Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010804
10805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010806unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 Py_ssize_t i, length;
10809 int kind;
10810 void *data;
10811
10812 if (PyUnicode_READY(self) == -1)
10813 return NULL;
10814 length = PyUnicode_GET_LENGTH(self);
10815 kind = PyUnicode_KIND(self);
10816 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010817
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010818 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (length == 1)
10820 return PyBool_FromLong(
10821 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010822
10823 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 for (i = 0; i < length; i++) {
10828 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010831 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010832}
10833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010836\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010837Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010839
10840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010841unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind;
10844 void *data;
10845 Py_ssize_t len, i;
10846
10847 if (PyUnicode_READY(self) == -1)
10848 return NULL;
10849
10850 kind = PyUnicode_KIND(self);
10851 data = PyUnicode_DATA(self);
10852 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010853
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010854 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (len == 1) {
10856 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10857 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10858 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010859
10860 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 for (i = 0; i < len; i++) {
10865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010866 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010870}
10871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010872PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010875Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010879unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 Py_ssize_t i, length;
10882 int kind;
10883 void *data;
10884
10885 if (PyUnicode_READY(self) == -1)
10886 return NULL;
10887 length = PyUnicode_GET_LENGTH(self);
10888 kind = PyUnicode_KIND(self);
10889 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (length == 1)
10893 return PyBool_FromLong(
10894 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 for (i = 0; i < length; i++) {
10901 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010904 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905}
10906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010910Return True if all characters in S are digits\n\
10911and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010914unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 Py_ssize_t i, length;
10917 int kind;
10918 void *data;
10919
10920 if (PyUnicode_READY(self) == -1)
10921 return NULL;
10922 length = PyUnicode_GET_LENGTH(self);
10923 kind = PyUnicode_KIND(self);
10924 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (length == 1) {
10928 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10929 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 for (i = 0; i < length; i++) {
10937 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010940 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941}
10942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010943PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010946Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010947False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
10949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010950unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 Py_ssize_t i, length;
10953 int kind;
10954 void *data;
10955
10956 if (PyUnicode_READY(self) == -1)
10957 return NULL;
10958 length = PyUnicode_GET_LENGTH(self);
10959 kind = PyUnicode_KIND(self);
10960 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (length == 1)
10964 return PyBool_FromLong(
10965 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 for (i = 0; i < length; i++) {
10972 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Martin v. Löwis47383402007-08-15 07:32:56 +000010978int
10979PyUnicode_IsIdentifier(PyObject *self)
10980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 int kind;
10982 void *data;
10983 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010984 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 if (PyUnicode_READY(self) == -1) {
10987 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 }
10990
10991 /* Special case for empty strings */
10992 if (PyUnicode_GET_LENGTH(self) == 0)
10993 return 0;
10994 kind = PyUnicode_KIND(self);
10995 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010996
10997 /* PEP 3131 says that the first character must be in
10998 XID_Start and subsequent characters in XID_Continue,
10999 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011001 letters, digits, underscore). However, given the current
11002 definition of XID_Start and XID_Continue, it is sufficient
11003 to check just for these, except that _ must be allowed
11004 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011006 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011007 return 0;
11008
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011009 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011012 return 1;
11013}
11014
11015PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011017\n\
11018Return True if S is a valid identifier according\n\
11019to the language definition.");
11020
11021static PyObject*
11022unicode_isidentifier(PyObject *self)
11023{
11024 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11025}
11026
Georg Brandl559e5d72008-06-11 18:37:52 +000011027PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011029\n\
11030Return True if all characters in S are considered\n\
11031printable in repr() or S is empty, False otherwise.");
11032
11033static PyObject*
11034unicode_isprintable(PyObject *self)
11035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 Py_ssize_t i, length;
11037 int kind;
11038 void *data;
11039
11040 if (PyUnicode_READY(self) == -1)
11041 return NULL;
11042 length = PyUnicode_GET_LENGTH(self);
11043 kind = PyUnicode_KIND(self);
11044 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011045
11046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (length == 1)
11048 return PyBool_FromLong(
11049 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 for (i = 0; i < length; i++) {
11052 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011053 Py_RETURN_FALSE;
11054 }
11055 }
11056 Py_RETURN_TRUE;
11057}
11058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011060 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061\n\
11062Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011063iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011066unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011068 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069}
11070
Martin v. Löwis18e16552006-02-15 17:27:45 +000011071static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072unicode_length(PyUnicodeObject *self)
11073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (PyUnicode_READY(self) == -1)
11075 return -1;
11076 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077}
11078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011082Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011083done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011086unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011088 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 Py_UCS4 fillchar = ' ';
11090
11091 if (PyUnicode_READY(self) == -1)
11092 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011093
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011094 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 return NULL;
11096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 Py_INCREF(self);
11099 return (PyObject*) self;
11100 }
11101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103}
11104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011105PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011111unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113 return fixup(self, fixlower);
11114}
11115
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11124
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011125/* externally visible for str.strip(unicode) */
11126PyObject *
11127_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 void *data;
11130 int kind;
11131 Py_ssize_t i, j, len;
11132 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11135 return NULL;
11136
11137 kind = PyUnicode_KIND(self);
11138 data = PyUnicode_DATA(self);
11139 len = PyUnicode_GET_LENGTH(self);
11140 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11141 PyUnicode_DATA(sepobj),
11142 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143
Benjamin Peterson14339b62009-01-31 16:36:08 +000011144 i = 0;
11145 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 while (i < len &&
11147 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 i++;
11149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011150 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011151
Benjamin Peterson14339b62009-01-31 16:36:08 +000011152 j = len;
11153 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 do {
11155 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 } while (j >= i &&
11157 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011160
Victor Stinner12bab6d2011-10-01 01:53:49 +020011161 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162}
11163
11164PyObject*
11165PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11166{
11167 unsigned char *data;
11168 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011169 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170
Victor Stinnerde636f32011-10-01 03:55:54 +020011171 if (PyUnicode_READY(self) == -1)
11172 return NULL;
11173
11174 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11175
Victor Stinner12bab6d2011-10-01 01:53:49 +020011176 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011178 if (PyUnicode_CheckExact(self)) {
11179 Py_INCREF(self);
11180 return self;
11181 }
11182 else
11183 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 }
11185
Victor Stinner12bab6d2011-10-01 01:53:49 +020011186 length = end - start;
11187 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011188 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189
Victor Stinnerde636f32011-10-01 03:55:54 +020011190 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011191 PyErr_SetString(PyExc_IndexError, "string index out of range");
11192 return NULL;
11193 }
11194
Victor Stinnerb9275c12011-10-05 14:01:42 +020011195 if (PyUnicode_IS_ASCII(self)) {
11196 kind = PyUnicode_KIND(self);
11197 data = PyUnicode_1BYTE_DATA(self);
11198 return unicode_fromascii(data + start, length);
11199 }
11200 else {
11201 kind = PyUnicode_KIND(self);
11202 data = PyUnicode_1BYTE_DATA(self);
11203 return PyUnicode_FromKindAndData(kind,
11204 data + PyUnicode_KIND_SIZE(kind, start),
11205 length);
11206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
11209static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011210do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 int kind;
11213 void *data;
11214 Py_ssize_t len, i, j;
11215
11216 if (PyUnicode_READY(self) == -1)
11217 return NULL;
11218
11219 kind = PyUnicode_KIND(self);
11220 data = PyUnicode_DATA(self);
11221 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011222
Benjamin Peterson14339b62009-01-31 16:36:08 +000011223 i = 0;
11224 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011226 i++;
11227 }
11228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011229
Benjamin Peterson14339b62009-01-31 16:36:08 +000011230 j = len;
11231 if (striptype != LEFTSTRIP) {
11232 do {
11233 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011235 j++;
11236 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011237
Victor Stinner12bab6d2011-10-01 01:53:49 +020011238 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239}
11240
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011241
11242static PyObject *
11243do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011245 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011246
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11248 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011249
Benjamin Peterson14339b62009-01-31 16:36:08 +000011250 if (sep != NULL && sep != Py_None) {
11251 if (PyUnicode_Check(sep))
11252 return _PyUnicode_XStrip(self, striptype, sep);
11253 else {
11254 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "%s arg must be None or str",
11256 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011257 return NULL;
11258 }
11259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011260
Benjamin Peterson14339b62009-01-31 16:36:08 +000011261 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011262}
11263
11264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011267\n\
11268Return a copy of the string S with leading and trailing\n\
11269whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011270If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011271
11272static PyObject *
11273unicode_strip(PyUnicodeObject *self, PyObject *args)
11274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011275 if (PyTuple_GET_SIZE(args) == 0)
11276 return do_strip(self, BOTHSTRIP); /* Common case */
11277 else
11278 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011279}
11280
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011284\n\
11285Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011286If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011287
11288static PyObject *
11289unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11290{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291 if (PyTuple_GET_SIZE(args) == 0)
11292 return do_strip(self, LEFTSTRIP); /* Common case */
11293 else
11294 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011295}
11296
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011300\n\
11301Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011302If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011303
11304static PyObject *
11305unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011307 if (PyTuple_GET_SIZE(args) == 0)
11308 return do_strip(self, RIGHTSTRIP); /* Common case */
11309 else
11310 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011311}
11312
11313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011315unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
11317 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Georg Brandl222de0f2009-04-12 12:01:50 +000011320 if (len < 1) {
11321 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011322 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324
Tim Peters7a29bd52001-09-12 03:03:31 +000011325 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 /* no repeat, return original string */
11327 Py_INCREF(str);
11328 return (PyObject*) str;
11329 }
Tim Peters8f422462000-09-09 06:13:41 +000011330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (PyUnicode_READY(str) == -1)
11332 return NULL;
11333
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011334 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011335 PyErr_SetString(PyExc_OverflowError,
11336 "repeated string is too long");
11337 return NULL;
11338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 if (!u)
11343 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011344 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (PyUnicode_GET_LENGTH(str) == 1) {
11347 const int kind = PyUnicode_KIND(str);
11348 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11349 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011350 if (kind == PyUnicode_1BYTE_KIND)
11351 memset(to, (unsigned char)fill_char, len);
11352 else {
11353 for (n = 0; n < len; ++n)
11354 PyUnicode_WRITE(kind, to, n, fill_char);
11355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 }
11357 else {
11358 /* number of characters copied this far */
11359 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11360 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11361 char *to = (char *) PyUnicode_DATA(u);
11362 Py_MEMCPY(to, PyUnicode_DATA(str),
11363 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 n = (done <= nchars-done) ? done : nchars-done;
11366 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011367 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 }
11370
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011371 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 return (PyObject*) u;
11373}
11374
Alexander Belopolsky40018472011-02-26 01:02:56 +000011375PyObject *
11376PyUnicode_Replace(PyObject *obj,
11377 PyObject *subobj,
11378 PyObject *replobj,
11379 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380{
11381 PyObject *self;
11382 PyObject *str1;
11383 PyObject *str2;
11384 PyObject *result;
11385
11386 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011387 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011390 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 Py_DECREF(self);
11392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 }
11394 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011395 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 Py_DECREF(self);
11397 Py_DECREF(str1);
11398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 Py_DECREF(self);
11402 Py_DECREF(str1);
11403 Py_DECREF(str2);
11404 return result;
11405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011408 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
11410Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011411old replaced by new. If the optional argument count is\n\
11412given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 PyObject *str1;
11418 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011419 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 PyObject *result;
11421
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 str1 = PyUnicode_FromObject(str1);
11427 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11428 return NULL;
11429 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011430 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 Py_DECREF(str1);
11432 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435 result = replace(self, str1, str2, maxcount);
11436
11437 Py_DECREF(str1);
11438 Py_DECREF(str2);
11439 return result;
11440}
11441
Alexander Belopolsky40018472011-02-26 01:02:56 +000011442static PyObject *
11443unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011445 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t isize;
11447 Py_ssize_t osize, squote, dquote, i, o;
11448 Py_UCS4 max, quote;
11449 int ikind, okind;
11450 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011453 return NULL;
11454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 isize = PyUnicode_GET_LENGTH(unicode);
11456 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 /* Compute length of output, quote characters, and
11459 maximum character */
11460 osize = 2; /* quotes */
11461 max = 127;
11462 squote = dquote = 0;
11463 ikind = PyUnicode_KIND(unicode);
11464 for (i = 0; i < isize; i++) {
11465 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11466 switch (ch) {
11467 case '\'': squote++; osize++; break;
11468 case '"': dquote++; osize++; break;
11469 case '\\': case '\t': case '\r': case '\n':
11470 osize += 2; break;
11471 default:
11472 /* Fast-path ASCII */
11473 if (ch < ' ' || ch == 0x7f)
11474 osize += 4; /* \xHH */
11475 else if (ch < 0x7f)
11476 osize++;
11477 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11478 osize++;
11479 max = ch > max ? ch : max;
11480 }
11481 else if (ch < 0x100)
11482 osize += 4; /* \xHH */
11483 else if (ch < 0x10000)
11484 osize += 6; /* \uHHHH */
11485 else
11486 osize += 10; /* \uHHHHHHHH */
11487 }
11488 }
11489
11490 quote = '\'';
11491 if (squote) {
11492 if (dquote)
11493 /* Both squote and dquote present. Use squote,
11494 and escape them */
11495 osize += squote;
11496 else
11497 quote = '"';
11498 }
11499
11500 repr = PyUnicode_New(osize, max);
11501 if (repr == NULL)
11502 return NULL;
11503 okind = PyUnicode_KIND(repr);
11504 odata = PyUnicode_DATA(repr);
11505
11506 PyUnicode_WRITE(okind, odata, 0, quote);
11507 PyUnicode_WRITE(okind, odata, osize-1, quote);
11508
11509 for (i = 0, o = 1; i < isize; i++) {
11510 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011511
11512 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if ((ch == quote) || (ch == '\\')) {
11514 PyUnicode_WRITE(okind, odata, o++, '\\');
11515 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011516 continue;
11517 }
11518
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011520 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 PyUnicode_WRITE(okind, odata, o++, '\\');
11522 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011523 }
11524 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 PyUnicode_WRITE(okind, odata, o++, '\\');
11526 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011527 }
11528 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 PyUnicode_WRITE(okind, odata, o++, '\\');
11530 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011531 }
11532
11533 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011534 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 PyUnicode_WRITE(okind, odata, o++, '\\');
11536 PyUnicode_WRITE(okind, odata, o++, 'x');
11537 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11538 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011539 }
11540
Georg Brandl559e5d72008-06-11 18:37:52 +000011541 /* Copy ASCII characters as-is */
11542 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011544 }
11545
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011547 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011548 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011549 (categories Z* and C* except ASCII space)
11550 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011552 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (ch <= 0xff) {
11554 PyUnicode_WRITE(okind, odata, o++, '\\');
11555 PyUnicode_WRITE(okind, odata, o++, 'x');
11556 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11557 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011558 }
11559 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 else if (ch >= 0x10000) {
11561 PyUnicode_WRITE(okind, odata, o++, '\\');
11562 PyUnicode_WRITE(okind, odata, o++, 'U');
11563 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11564 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11565 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11566 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11567 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11569 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11570 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011571 }
11572 /* Map 16-bit characters to '\uxxxx' */
11573 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 PyUnicode_WRITE(okind, odata, o++, '\\');
11575 PyUnicode_WRITE(okind, odata, o++, 'u');
11576 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11577 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11578 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11579 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011580 }
11581 }
11582 /* Copy characters as-is */
11583 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011585 }
11586 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011589 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011590 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595\n\
11596Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011597such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598arguments start and end are interpreted as in slice notation.\n\
11599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Jesus Ceaac451502011-04-20 17:09:23 +020011605 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011606 Py_ssize_t start;
11607 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Jesus Ceaac451502011-04-20 17:09:23 +020011610 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11611 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 if (PyUnicode_READY(substring) == -1)
11617 return NULL;
11618
11619 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011620 asciilib_rfind_slice, ucs1lib_rfind_slice,
11621 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (result == -2)
11628 return NULL;
11629
Christian Heimes217cfd12007-12-02 14:31:20 +000011630 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631}
11632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
11638static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Jesus Ceaac451502011-04-20 17:09:23 +020011641 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011642 Py_ssize_t start;
11643 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645
Jesus Ceaac451502011-04-20 17:09:23 +020011646 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11647 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652 if (PyUnicode_READY(substring) == -1)
11653 return NULL;
11654
11655 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011656 asciilib_rfind_slice, ucs1lib_rfind_slice,
11657 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011659 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
11661 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (result == -2)
11664 return NULL;
11665
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666 if (result < 0) {
11667 PyErr_SetString(PyExc_ValueError, "substring not found");
11668 return NULL;
11669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670
Christian Heimes217cfd12007-12-02 14:31:20 +000011671 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672}
11673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011674PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011677Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011678done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011681unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011683 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 Py_UCS4 fillchar = ' ';
11685
Victor Stinnere9a29352011-10-01 02:14:59 +020011686 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011688
Victor Stinnere9a29352011-10-01 02:14:59 +020011689 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 return NULL;
11691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 Py_INCREF(self);
11694 return (PyObject*) self;
11695 }
11696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698}
11699
Alexander Belopolsky40018472011-02-26 01:02:56 +000011700PyObject *
11701PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702{
11703 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 s = PyUnicode_FromObject(s);
11706 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011707 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 if (sep != NULL) {
11709 sep = PyUnicode_FromObject(sep);
11710 if (sep == NULL) {
11711 Py_DECREF(s);
11712 return NULL;
11713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 }
11715
Victor Stinner9310abb2011-10-05 00:59:23 +020011716 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
11718 Py_DECREF(s);
11719 Py_XDECREF(sep);
11720 return result;
11721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
11726Return a list of the words in S, using sep as the\n\
11727delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011728splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011729whitespace string is a separator and empty strings are\n\
11730removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011733unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
11735 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011736 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Martin v. Löwis18e16552006-02-15 17:27:45 +000011738 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 return NULL;
11740
11741 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011744 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Thomas Wouters477c8d52006-05-27 19:21:47 +000011749PyObject *
11750PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11751{
11752 PyObject* str_obj;
11753 PyObject* sep_obj;
11754 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 int kind1, kind2, kind;
11756 void *buf1 = NULL, *buf2 = NULL;
11757 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011758
11759 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011760 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011762 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011764 Py_DECREF(str_obj);
11765 return NULL;
11766 }
11767
Victor Stinner14f8f022011-10-05 20:58:25 +020011768 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011770 kind = Py_MAX(kind1, kind2);
11771 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011773 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (!buf1)
11775 goto onError;
11776 buf2 = PyUnicode_DATA(sep_obj);
11777 if (kind2 != kind)
11778 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11779 if (!buf2)
11780 goto onError;
11781 len1 = PyUnicode_GET_LENGTH(str_obj);
11782 len2 = PyUnicode_GET_LENGTH(sep_obj);
11783
Victor Stinner14f8f022011-10-05 20:58:25 +020011784 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011786 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11787 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11788 else
11789 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 break;
11791 case PyUnicode_2BYTE_KIND:
11792 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11793 break;
11794 case PyUnicode_4BYTE_KIND:
11795 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11796 break;
11797 default:
11798 assert(0);
11799 out = 0;
11800 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011801
11802 Py_DECREF(sep_obj);
11803 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (kind1 != kind)
11805 PyMem_Free(buf1);
11806 if (kind2 != kind)
11807 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011808
11809 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 onError:
11811 Py_DECREF(sep_obj);
11812 Py_DECREF(str_obj);
11813 if (kind1 != kind && buf1)
11814 PyMem_Free(buf1);
11815 if (kind2 != kind && buf2)
11816 PyMem_Free(buf2);
11817 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011818}
11819
11820
11821PyObject *
11822PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11823{
11824 PyObject* str_obj;
11825 PyObject* sep_obj;
11826 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 int kind1, kind2, kind;
11828 void *buf1 = NULL, *buf2 = NULL;
11829 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830
11831 str_obj = PyUnicode_FromObject(str_in);
11832 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011834 sep_obj = PyUnicode_FromObject(sep_in);
11835 if (!sep_obj) {
11836 Py_DECREF(str_obj);
11837 return NULL;
11838 }
11839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 kind1 = PyUnicode_KIND(str_in);
11841 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011842 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 buf1 = PyUnicode_DATA(str_in);
11844 if (kind1 != kind)
11845 buf1 = _PyUnicode_AsKind(str_in, kind);
11846 if (!buf1)
11847 goto onError;
11848 buf2 = PyUnicode_DATA(sep_obj);
11849 if (kind2 != kind)
11850 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11851 if (!buf2)
11852 goto onError;
11853 len1 = PyUnicode_GET_LENGTH(str_obj);
11854 len2 = PyUnicode_GET_LENGTH(sep_obj);
11855
11856 switch(PyUnicode_KIND(str_in)) {
11857 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011858 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11859 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11860 else
11861 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 break;
11863 case PyUnicode_2BYTE_KIND:
11864 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11865 break;
11866 case PyUnicode_4BYTE_KIND:
11867 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11868 break;
11869 default:
11870 assert(0);
11871 out = 0;
11872 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011873
11874 Py_DECREF(sep_obj);
11875 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (kind1 != kind)
11877 PyMem_Free(buf1);
11878 if (kind2 != kind)
11879 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011880
11881 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 onError:
11883 Py_DECREF(sep_obj);
11884 Py_DECREF(str_obj);
11885 if (kind1 != kind && buf1)
11886 PyMem_Free(buf1);
11887 if (kind2 != kind && buf2)
11888 PyMem_Free(buf2);
11889 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011890}
11891
11892PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011894\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011895Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011896the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011897found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011898
11899static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011900unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901{
Victor Stinner9310abb2011-10-05 00:59:23 +020011902 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011903}
11904
11905PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011906 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011907\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011908Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011909the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011910separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011911
11912static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011913unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011914{
Victor Stinner9310abb2011-10-05 00:59:23 +020011915 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011916}
11917
Alexander Belopolsky40018472011-02-26 01:02:56 +000011918PyObject *
11919PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011920{
11921 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011923 s = PyUnicode_FromObject(s);
11924 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 if (sep != NULL) {
11927 sep = PyUnicode_FromObject(sep);
11928 if (sep == NULL) {
11929 Py_DECREF(s);
11930 return NULL;
11931 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011932 }
11933
Victor Stinner9310abb2011-10-05 00:59:23 +020011934 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011935
11936 Py_DECREF(s);
11937 Py_XDECREF(sep);
11938 return result;
11939}
11940
11941PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011943\n\
11944Return a list of the words in S, using sep as the\n\
11945delimiter string, starting at the end of the string and\n\
11946working to the front. If maxsplit is given, at most maxsplit\n\
11947splits are done. If sep is not specified, any whitespace string\n\
11948is a separator.");
11949
11950static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011951unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011952{
11953 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011954 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955
Martin v. Löwis18e16552006-02-15 17:27:45 +000011956 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011957 return NULL;
11958
11959 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011961 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011962 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011963 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011964 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011965}
11966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011967PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969\n\
11970Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011971Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011972is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
11974static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011975unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011977 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011978 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011980 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11981 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 return NULL;
11983
Guido van Rossum86662912000-04-11 15:38:46 +000011984 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985}
11986
11987static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011988PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Walter Dörwald346737f2007-05-31 10:44:43 +000011990 if (PyUnicode_CheckExact(self)) {
11991 Py_INCREF(self);
11992 return self;
11993 } else
11994 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011995 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996}
11997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011998PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000\n\
12001Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012005unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 return fixup(self, fixswapcase);
12008}
12009
Georg Brandlceee0772007-11-27 23:48:05 +000012010PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012012\n\
12013Return a translation table usable for str.translate().\n\
12014If there is only one argument, it must be a dictionary mapping Unicode\n\
12015ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012016Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012017If there are two arguments, they must be strings of equal length, and\n\
12018in the resulting dictionary, each character in x will be mapped to the\n\
12019character at the same position in y. If there is a third argument, it\n\
12020must be a string, whose characters will be mapped to None in the result.");
12021
12022static PyObject*
12023unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12024{
12025 PyObject *x, *y = NULL, *z = NULL;
12026 PyObject *new = NULL, *key, *value;
12027 Py_ssize_t i = 0;
12028 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029
Georg Brandlceee0772007-11-27 23:48:05 +000012030 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12031 return NULL;
12032 new = PyDict_New();
12033 if (!new)
12034 return NULL;
12035 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 int x_kind, y_kind, z_kind;
12037 void *x_data, *y_data, *z_data;
12038
Georg Brandlceee0772007-11-27 23:48:05 +000012039 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012040 if (!PyUnicode_Check(x)) {
12041 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12042 "be a string if there is a second argument");
12043 goto err;
12044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012046 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12047 "arguments must have equal length");
12048 goto err;
12049 }
12050 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 x_kind = PyUnicode_KIND(x);
12052 y_kind = PyUnicode_KIND(y);
12053 x_data = PyUnicode_DATA(x);
12054 y_data = PyUnicode_DATA(y);
12055 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12056 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12057 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012058 if (!key || !value)
12059 goto err;
12060 res = PyDict_SetItem(new, key, value);
12061 Py_DECREF(key);
12062 Py_DECREF(value);
12063 if (res < 0)
12064 goto err;
12065 }
12066 /* create entries for deleting chars in z */
12067 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 z_kind = PyUnicode_KIND(z);
12069 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012070 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012072 if (!key)
12073 goto err;
12074 res = PyDict_SetItem(new, key, Py_None);
12075 Py_DECREF(key);
12076 if (res < 0)
12077 goto err;
12078 }
12079 }
12080 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 int kind;
12082 void *data;
12083
Georg Brandlceee0772007-11-27 23:48:05 +000012084 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012085 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012086 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12087 "to maketrans it must be a dict");
12088 goto err;
12089 }
12090 /* copy entries into the new dict, converting string keys to int keys */
12091 while (PyDict_Next(x, &i, &key, &value)) {
12092 if (PyUnicode_Check(key)) {
12093 /* convert string keys to integer keys */
12094 PyObject *newkey;
12095 if (PyUnicode_GET_SIZE(key) != 1) {
12096 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12097 "table must be of length 1");
12098 goto err;
12099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 kind = PyUnicode_KIND(key);
12101 data = PyUnicode_DATA(key);
12102 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012103 if (!newkey)
12104 goto err;
12105 res = PyDict_SetItem(new, newkey, value);
12106 Py_DECREF(newkey);
12107 if (res < 0)
12108 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012109 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012110 /* just keep integer keys */
12111 if (PyDict_SetItem(new, key, value) < 0)
12112 goto err;
12113 } else {
12114 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12115 "be strings or integers");
12116 goto err;
12117 }
12118 }
12119 }
12120 return new;
12121 err:
12122 Py_DECREF(new);
12123 return NULL;
12124}
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128\n\
12129Return a copy of the string S, where all characters have been mapped\n\
12130through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012131Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012132Unmapped characters are left untouched. Characters mapped to None\n\
12133are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
12135static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145
12146static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012147unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149 return fixup(self, fixupper);
12150}
12151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012155Pad a numeric string S with zeros on the left, to fill a field\n\
12156of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012159unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012161 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012162 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012163 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 int kind;
12165 void *data;
12166 Py_UCS4 chr;
12167
12168 if (PyUnicode_READY(self) == -1)
12169 return NULL;
12170
Martin v. Löwis18e16552006-02-15 17:27:45 +000012171 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 return NULL;
12173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012175 if (PyUnicode_CheckExact(self)) {
12176 Py_INCREF(self);
12177 return (PyObject*) self;
12178 }
12179 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012180 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 }
12182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
12185 u = pad(self, fill, 0, '0');
12186
Walter Dörwald068325e2002-04-15 13:36:47 +000012187 if (u == NULL)
12188 return NULL;
12189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 kind = PyUnicode_KIND(u);
12191 data = PyUnicode_DATA(u);
12192 chr = PyUnicode_READ(kind, data, fill);
12193
12194 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 PyUnicode_WRITE(kind, data, 0, chr);
12197 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 }
12199
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012200 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 return (PyObject*) u;
12202}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
#if 0
/* Compiled-out helper: forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii;

    ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012212PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012215Return True if S starts with the specified prefix, False otherwise.\n\
12216With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012217With optional end, stop comparing S at that position.\n\
12218prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219
12220static PyObject *
12221unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012224 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012226 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012227 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012228 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Jesus Ceaac451502011-04-20 17:09:23 +020012230 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012232 if (PyTuple_Check(subobj)) {
12233 Py_ssize_t i;
12234 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12235 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012237 if (substring == NULL)
12238 return NULL;
12239 result = tailmatch(self, substring, start, end, -1);
12240 Py_DECREF(substring);
12241 if (result) {
12242 Py_RETURN_TRUE;
12243 }
12244 }
12245 /* nothing matched */
12246 Py_RETURN_FALSE;
12247 }
12248 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012249 if (substring == NULL) {
12250 if (PyErr_ExceptionMatches(PyExc_TypeError))
12251 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12252 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012254 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012255 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012257 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
12260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012261PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012264Return True if S ends with the specified suffix, False otherwise.\n\
12265With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012266With optional end, stop comparing S at that position.\n\
12267suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject *
12270unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012273 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012275 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012276 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012277 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
Jesus Ceaac451502011-04-20 17:09:23 +020012279 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012281 if (PyTuple_Check(subobj)) {
12282 Py_ssize_t i;
12283 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12284 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012286 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012288 result = tailmatch(self, substring, start, end, +1);
12289 Py_DECREF(substring);
12290 if (result) {
12291 Py_RETURN_TRUE;
12292 }
12293 }
12294 Py_RETURN_FALSE;
12295 }
12296 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012297 if (substring == NULL) {
12298 if (PyErr_ExceptionMatches(PyExc_TypeError))
12299 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12300 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012302 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012303 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012305 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306}
12307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012309
12310PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012312\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012313Return a formatted version of S, using substitutions from args and kwargs.\n\
12314The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012315
Eric Smith27bbca62010-11-04 17:06:58 +000012316PyDoc_STRVAR(format_map__doc__,
12317 "S.format_map(mapping) -> str\n\
12318\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012319Return a formatted version of S, using substitutions from mapping.\n\
12320The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012321
Eric Smith4a7d76d2008-05-30 18:10:19 +000012322static PyObject *
12323unicode__format__(PyObject* self, PyObject* args)
12324{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012325 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012326
12327 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12328 return NULL;
12329
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012330 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012332 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012333}
12334
Eric Smith8c663262007-08-25 02:26:07 +000012335PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012337\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012338Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012339
12340static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012341unicode__sizeof__(PyUnicodeObject *v)
12342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 Py_ssize_t size;
12344
12345 /* If it's a compact object, account for base structure +
12346 character data. */
12347 if (PyUnicode_IS_COMPACT_ASCII(v))
12348 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12349 else if (PyUnicode_IS_COMPACT(v))
12350 size = sizeof(PyCompactUnicodeObject) +
12351 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12352 else {
12353 /* If it is a two-block object, account for base object, and
12354 for character block if present. */
12355 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012356 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 size += (PyUnicode_GET_LENGTH(v) + 1) *
12358 PyUnicode_CHARACTER_SIZE(v);
12359 }
12360 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012361 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012362 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012364 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012365 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366
12367 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012368}
12369
12370PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012372
12373static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012374unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012375{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012376 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 if (!copy)
12378 return NULL;
12379 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012380}
12381
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382static PyMethodDef unicode_methods[] = {
12383
12384 /* Order is according to common usage: often used methods should
12385 appear first, since lookup is done sequentially. */
12386
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012387 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012388 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12389 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012390 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012391 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12392 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12393 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12394 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12395 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12396 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12397 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012399 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12400 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12401 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012403 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12404 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12405 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012406 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012408 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012409 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012410 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12411 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12412 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12413 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12414 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12415 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12416 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12417 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12418 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12419 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12420 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12421 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12422 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12423 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012424 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012425 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012426 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012427 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012428 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012429 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012430 {"maketrans", (PyCFunction) unicode_maketrans,
12431 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012432 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012433#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012434 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435#endif
12436
12437#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012438 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012439 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440#endif
12441
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443 {NULL, NULL}
12444};
12445
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012446static PyObject *
12447unicode_mod(PyObject *v, PyObject *w)
12448{
Brian Curtindfc80e32011-08-10 20:28:54 -050012449 if (!PyUnicode_Check(v))
12450 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012451 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012452}
12453
12454static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 0, /*nb_add*/
12456 0, /*nb_subtract*/
12457 0, /*nb_multiply*/
12458 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012459};
12460
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 (lenfunc) unicode_length, /* sq_length */
12463 PyUnicode_Concat, /* sq_concat */
12464 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12465 (ssizeargfunc) unicode_getitem, /* sq_item */
12466 0, /* sq_slice */
12467 0, /* sq_ass_item */
12468 0, /* sq_ass_slice */
12469 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470};
12471
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012472static PyObject*
12473unicode_subscript(PyUnicodeObject* self, PyObject* item)
12474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 if (PyUnicode_READY(self) == -1)
12476 return NULL;
12477
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012478 if (PyIndex_Check(item)) {
12479 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012480 if (i == -1 && PyErr_Occurred())
12481 return NULL;
12482 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012484 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012485 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012486 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012487 PyObject *result;
12488 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012489 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012490 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012494 return NULL;
12495 }
12496
12497 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 return PyUnicode_New(0, 0);
12499 } else if (start == 0 && step == 1 &&
12500 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012501 PyUnicode_CheckExact(self)) {
12502 Py_INCREF(self);
12503 return (PyObject *)self;
12504 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012505 return PyUnicode_Substring((PyObject*)self,
12506 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012507 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012508 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012509 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012510 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012511 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012512 src_data = PyUnicode_DATA(self);
12513 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12514 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012515 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012516 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012517 if (max_char >= kind_limit)
12518 break;
12519 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012520 }
12521 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012522 if (result == NULL)
12523 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012524 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012525 dest_data = PyUnicode_DATA(result);
12526
12527 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012528 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12529 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012530 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012531 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012532 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012533 } else {
12534 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12535 return NULL;
12536 }
12537}
12538
12539static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012540 (lenfunc)unicode_length, /* mp_length */
12541 (binaryfunc)unicode_subscript, /* mp_subscript */
12542 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012543};
12544
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546/* Helpers for PyUnicode_Format() */
12547
12548static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012549getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012551 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 (*p_argidx)++;
12554 if (arglen < 0)
12555 return args;
12556 else
12557 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558 }
12559 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 return NULL;
12562}
12563
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012564/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012566static PyObject *
12567formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012569 char *p;
12570 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012572
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 x = PyFloat_AsDouble(v);
12574 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012575 return NULL;
12576
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012579
Eric Smith0923d1d2009-04-16 20:16:10 +000012580 p = PyOS_double_to_string(x, type, prec,
12581 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012582 if (p == NULL)
12583 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012585 PyMem_Free(p);
12586 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587}
12588
Tim Peters38fd5b62000-09-21 05:43:11 +000012589static PyObject*
12590formatlong(PyObject *val, int flags, int prec, int type)
12591{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 char *buf;
12593 int len;
12594 PyObject *str; /* temporary string object. */
12595 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012596
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12598 if (!str)
12599 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 Py_DECREF(str);
12602 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012603}
12604
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012607 size_t buflen,
12608 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012610 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012611 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 if (PyUnicode_GET_LENGTH(v) == 1) {
12613 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 buf[1] = '\0';
12615 return 1;
12616 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 goto onError;
12618 }
12619 else {
12620 /* Integer input truncated to a character */
12621 long x;
12622 x = PyLong_AsLong(v);
12623 if (x == -1 && PyErr_Occurred())
12624 goto onError;
12625
12626 if (x < 0 || x > 0x10ffff) {
12627 PyErr_SetString(PyExc_OverflowError,
12628 "%c arg not in range(0x110000)");
12629 return -1;
12630 }
12631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012633 buf[1] = '\0';
12634 return 1;
12635 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012636
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012638 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012640 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641}
12642
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012647
Alexander Belopolsky40018472011-02-26 01:02:56 +000012648PyObject *
12649PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 void *fmt;
12652 int fmtkind;
12653 PyObject *result;
12654 Py_UCS4 *res, *res0;
12655 Py_UCS4 max;
12656 int kind;
12657 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012661
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 PyErr_BadInternalCall();
12664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12667 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 fmt = PyUnicode_DATA(uformat);
12670 fmtkind = PyUnicode_KIND(uformat);
12671 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12672 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
12674 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12676 if (res0 == NULL) {
12677 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
12681 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 arglen = PyTuple_Size(args);
12683 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 }
12685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012686 arglen = -1;
12687 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012689 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012690 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
12693 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 if (--rescnt < 0) {
12696 rescnt = fmtcnt + 100;
12697 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12699 if (res0 == NULL){
12700 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 }
12703 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 }
12708 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 /* Got a format specifier */
12710 int flags = 0;
12711 Py_ssize_t width = -1;
12712 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 Py_UCS4 c = '\0';
12714 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 int isnumok;
12716 PyObject *v = NULL;
12717 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 void *pbuf;
12719 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 Py_ssize_t len, len1;
12722 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 fmtpos++;
12725 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12726 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 Py_ssize_t keylen;
12728 PyObject *key;
12729 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012730
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 if (dict == NULL) {
12732 PyErr_SetString(PyExc_TypeError,
12733 "format requires a mapping");
12734 goto onError;
12735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 /* Skip over balanced parentheses */
12740 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 if (fmtcnt < 0 || pcount > 0) {
12749 PyErr_SetString(PyExc_ValueError,
12750 "incomplete format key");
12751 goto onError;
12752 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012753 key = PyUnicode_Substring((PyObject*)uformat,
12754 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 if (key == NULL)
12756 goto onError;
12757 if (args_owned) {
12758 Py_DECREF(args);
12759 args_owned = 0;
12760 }
12761 args = PyObject_GetItem(dict, key);
12762 Py_DECREF(key);
12763 if (args == NULL) {
12764 goto onError;
12765 }
12766 args_owned = 1;
12767 arglen = -1;
12768 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 case '-': flags |= F_LJUST; continue;
12773 case '+': flags |= F_SIGN; continue;
12774 case ' ': flags |= F_BLANK; continue;
12775 case '#': flags |= F_ALT; continue;
12776 case '0': flags |= F_ZERO; continue;
12777 }
12778 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 if (c == '*') {
12781 v = getnextarg(args, arglen, &argidx);
12782 if (v == NULL)
12783 goto onError;
12784 if (!PyLong_Check(v)) {
12785 PyErr_SetString(PyExc_TypeError,
12786 "* wants int");
12787 goto onError;
12788 }
12789 width = PyLong_AsLong(v);
12790 if (width == -1 && PyErr_Occurred())
12791 goto onError;
12792 if (width < 0) {
12793 flags |= F_LJUST;
12794 width = -width;
12795 }
12796 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 }
12799 else if (c >= '0' && c <= '9') {
12800 width = c - '0';
12801 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 if (c < '0' || c > '9')
12804 break;
12805 if ((width*10) / 10 != width) {
12806 PyErr_SetString(PyExc_ValueError,
12807 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 }
12810 width = width*10 + (c - '0');
12811 }
12812 }
12813 if (c == '.') {
12814 prec = 0;
12815 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 if (c == '*') {
12818 v = getnextarg(args, arglen, &argidx);
12819 if (v == NULL)
12820 goto onError;
12821 if (!PyLong_Check(v)) {
12822 PyErr_SetString(PyExc_TypeError,
12823 "* wants int");
12824 goto onError;
12825 }
12826 prec = PyLong_AsLong(v);
12827 if (prec == -1 && PyErr_Occurred())
12828 goto onError;
12829 if (prec < 0)
12830 prec = 0;
12831 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 }
12834 else if (c >= '0' && c <= '9') {
12835 prec = c - '0';
12836 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 if (c < '0' || c > '9')
12839 break;
12840 if ((prec*10) / 10 != prec) {
12841 PyErr_SetString(PyExc_ValueError,
12842 "prec too big");
12843 goto onError;
12844 }
12845 prec = prec*10 + (c - '0');
12846 }
12847 }
12848 } /* prec */
12849 if (fmtcnt >= 0) {
12850 if (c == 'h' || c == 'l' || c == 'L') {
12851 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 }
12854 }
12855 if (fmtcnt < 0) {
12856 PyErr_SetString(PyExc_ValueError,
12857 "incomplete format");
12858 goto onError;
12859 }
12860 if (c != '%') {
12861 v = getnextarg(args, arglen, &argidx);
12862 if (v == NULL)
12863 goto onError;
12864 }
12865 sign = 0;
12866 fill = ' ';
12867 switch (c) {
12868
12869 case '%':
12870 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 len = 1;
12875 break;
12876
12877 case 's':
12878 case 'r':
12879 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012880 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 temp = v;
12882 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 }
12884 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 if (c == 's')
12886 temp = PyObject_Str(v);
12887 else if (c == 'r')
12888 temp = PyObject_Repr(v);
12889 else
12890 temp = PyObject_ASCII(v);
12891 if (temp == NULL)
12892 goto onError;
12893 if (PyUnicode_Check(temp))
12894 /* nothing to do */;
12895 else {
12896 Py_DECREF(temp);
12897 PyErr_SetString(PyExc_TypeError,
12898 "%s argument has non-string str()");
12899 goto onError;
12900 }
12901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 if (PyUnicode_READY(temp) == -1) {
12903 Py_CLEAR(temp);
12904 goto onError;
12905 }
12906 pbuf = PyUnicode_DATA(temp);
12907 kind = PyUnicode_KIND(temp);
12908 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 if (prec >= 0 && len > prec)
12910 len = prec;
12911 break;
12912
12913 case 'i':
12914 case 'd':
12915 case 'u':
12916 case 'o':
12917 case 'x':
12918 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 isnumok = 0;
12920 if (PyNumber_Check(v)) {
12921 PyObject *iobj=NULL;
12922
12923 if (PyLong_Check(v)) {
12924 iobj = v;
12925 Py_INCREF(iobj);
12926 }
12927 else {
12928 iobj = PyNumber_Long(v);
12929 }
12930 if (iobj!=NULL) {
12931 if (PyLong_Check(iobj)) {
12932 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012933 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 Py_DECREF(iobj);
12935 if (!temp)
12936 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 if (PyUnicode_READY(temp) == -1) {
12938 Py_CLEAR(temp);
12939 goto onError;
12940 }
12941 pbuf = PyUnicode_DATA(temp);
12942 kind = PyUnicode_KIND(temp);
12943 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 sign = 1;
12945 }
12946 else {
12947 Py_DECREF(iobj);
12948 }
12949 }
12950 }
12951 if (!isnumok) {
12952 PyErr_Format(PyExc_TypeError,
12953 "%%%c format: a number is required, "
12954 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12955 goto onError;
12956 }
12957 if (flags & F_ZERO)
12958 fill = '0';
12959 break;
12960
12961 case 'e':
12962 case 'E':
12963 case 'f':
12964 case 'F':
12965 case 'g':
12966 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012967 temp = formatfloat(v, flags, prec, c);
12968 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 if (PyUnicode_READY(temp) == -1) {
12971 Py_CLEAR(temp);
12972 goto onError;
12973 }
12974 pbuf = PyUnicode_DATA(temp);
12975 kind = PyUnicode_KIND(temp);
12976 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 sign = 1;
12978 if (flags & F_ZERO)
12979 fill = '0';
12980 break;
12981
12982 case 'c':
12983 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012985 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 if (len < 0)
12987 goto onError;
12988 break;
12989
12990 default:
12991 PyErr_Format(PyExc_ValueError,
12992 "unsupported format character '%c' (0x%x) "
12993 "at index %zd",
12994 (31<=c && c<=126) ? (char)c : '?',
12995 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 goto onError;
12998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 /* pbuf is initialized here. */
13000 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13003 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13004 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 len--;
13006 }
13007 else if (flags & F_SIGN)
13008 sign = '+';
13009 else if (flags & F_BLANK)
13010 sign = ' ';
13011 else
13012 sign = 0;
13013 }
13014 if (width < len)
13015 width = len;
13016 if (rescnt - (sign != 0) < width) {
13017 reslen -= rescnt;
13018 rescnt = width + fmtcnt + 100;
13019 reslen += rescnt;
13020 if (reslen < 0) {
13021 Py_XDECREF(temp);
13022 PyErr_NoMemory();
13023 goto onError;
13024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13026 if (res0 == 0) {
13027 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013028 Py_XDECREF(temp);
13029 goto onError;
13030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 }
13033 if (sign) {
13034 if (fill != ' ')
13035 *res++ = sign;
13036 rescnt--;
13037 if (width > len)
13038 width--;
13039 }
13040 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13042 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013043 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13045 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013046 }
13047 rescnt -= 2;
13048 width -= 2;
13049 if (width < 0)
13050 width = 0;
13051 len -= 2;
13052 }
13053 if (width > len && !(flags & F_LJUST)) {
13054 do {
13055 --rescnt;
13056 *res++ = fill;
13057 } while (--width > len);
13058 }
13059 if (fill == ' ') {
13060 if (sign)
13061 *res++ = sign;
13062 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13064 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13065 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13066 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013067 }
13068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 /* Copy all characters, preserving len */
13070 len1 = len;
13071 while (len1--) {
13072 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13073 rescnt--;
13074 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 while (--width >= len) {
13076 --rescnt;
13077 *res++ = ' ';
13078 }
13079 if (dict && (argidx < arglen) && c != '%') {
13080 PyErr_SetString(PyExc_TypeError,
13081 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013082 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 goto onError;
13084 }
13085 Py_XDECREF(temp);
13086 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087 } /* until end */
13088 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013089 PyErr_SetString(PyExc_TypeError,
13090 "not all arguments converted during string formatting");
13091 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 }
13093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094
13095 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13096 if (*res > max)
13097 max = *res;
13098 result = PyUnicode_New(reslen - rescnt, max);
13099 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 kind = PyUnicode_KIND(result);
13102 for (res = res0; res < res0+reslen-rescnt; res++)
13103 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13104 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107 }
13108 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013109 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 return (PyObject *)result;
13111
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 Py_DECREF(uformat);
13115 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 }
13118 return NULL;
13119}
13120
Jeremy Hylton938ace62002-07-17 16:30:39 +000013121static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013122unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13123
Tim Peters6d6c1a32001-08-02 04:15:00 +000013124static PyObject *
13125unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13126{
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013128 static char *kwlist[] = {"object", "encoding", "errors", 0};
13129 char *encoding = NULL;
13130 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013131
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 if (type != &PyUnicode_Type)
13133 return unicode_subtype_new(type, args, kwds);
13134 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013136 return NULL;
13137 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 if (encoding == NULL && errors == NULL)
13140 return PyObject_Str(x);
13141 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013143}
13144
Guido van Rossume023fe02001-08-30 03:12:59 +000013145static PyObject *
13146unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13147{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013148 PyUnicodeObject *unicode, *self;
13149 Py_ssize_t length, char_size;
13150 int share_wstr, share_utf8;
13151 unsigned int kind;
13152 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013153
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013155
13156 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13157 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013159 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013160 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013161 return NULL;
13162
13163 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13164 if (self == NULL) {
13165 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 return NULL;
13167 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013168 kind = PyUnicode_KIND(unicode);
13169 length = PyUnicode_GET_LENGTH(unicode);
13170
13171 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013172#ifdef Py_DEBUG
13173 _PyUnicode_HASH(self) = -1;
13174#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013175 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013176#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013177 _PyUnicode_STATE(self).interned = 0;
13178 _PyUnicode_STATE(self).kind = kind;
13179 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013180 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013181 _PyUnicode_STATE(self).ready = 1;
13182 _PyUnicode_WSTR(self) = NULL;
13183 _PyUnicode_UTF8_LENGTH(self) = 0;
13184 _PyUnicode_UTF8(self) = NULL;
13185 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013186 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013187
13188 share_utf8 = 0;
13189 share_wstr = 0;
13190 if (kind == PyUnicode_1BYTE_KIND) {
13191 char_size = 1;
13192 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13193 share_utf8 = 1;
13194 }
13195 else if (kind == PyUnicode_2BYTE_KIND) {
13196 char_size = 2;
13197 if (sizeof(wchar_t) == 2)
13198 share_wstr = 1;
13199 }
13200 else {
13201 assert(kind == PyUnicode_4BYTE_KIND);
13202 char_size = 4;
13203 if (sizeof(wchar_t) == 4)
13204 share_wstr = 1;
13205 }
13206
13207 /* Ensure we won't overflow the length. */
13208 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13209 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013211 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013212 data = PyObject_MALLOC((length + 1) * char_size);
13213 if (data == NULL) {
13214 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 goto onError;
13216 }
13217
Victor Stinnerc3c74152011-10-02 20:39:55 +020013218 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013219 if (share_utf8) {
13220 _PyUnicode_UTF8_LENGTH(self) = length;
13221 _PyUnicode_UTF8(self) = data;
13222 }
13223 if (share_wstr) {
13224 _PyUnicode_WSTR_LENGTH(self) = length;
13225 _PyUnicode_WSTR(self) = (wchar_t *)data;
13226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013228 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13229 PyUnicode_KIND_SIZE(kind, length + 1));
13230 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013231 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013232#ifdef Py_DEBUG
13233 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13234#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013235 return (PyObject *)self;
13236
13237onError:
13238 Py_DECREF(unicode);
13239 Py_DECREF(self);
13240 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013241}
13242
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013249
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013250static PyObject *unicode_iter(PyObject *seq);
13251
/* Type object for str.  Slots set to 0 are left unset in this static
   initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
13295
13296/* Initialize the Unicode implementation */
13297
Thomas Wouters78890102000-07-22 19:25:51 +000013298void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013300 int i;
13301
Thomas Wouters477c8d52006-05-27 19:21:47 +000013302 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013303 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013304 0x000A, /* LINE FEED */
13305 0x000D, /* CARRIAGE RETURN */
13306 0x001C, /* FILE SEPARATOR */
13307 0x001D, /* GROUP SEPARATOR */
13308 0x001E, /* RECORD SEPARATOR */
13309 0x0085, /* NEXT LINE */
13310 0x2028, /* LINE SEPARATOR */
13311 0x2029, /* PARAGRAPH SEPARATOR */
13312 };
13313
Fred Drakee4315f52000-05-09 19:53:39 +000013314 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013315 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013316 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013317 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013319
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013320 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013322 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013324
13325 /* initialize the linebreak bloom filter */
13326 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013328 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329
13330 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331}
13332
13333/* Finalize the Unicode implementation */
13334
/* Nothing is released by this implementation, so the reported count is
   always zero. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13340
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341void
Thomas Wouters78890102000-07-22 19:25:51 +000013342_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013344 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013346 Py_XDECREF(unicode_empty);
13347 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013348
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013349 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013350 if (unicode_latin1[i]) {
13351 Py_DECREF(unicode_latin1[i]);
13352 unicode_latin1[i] = NULL;
13353 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013354 }
Christian Heimesa156e092008-02-16 07:38:31 +000013355 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013357
Walter Dörwald16807132007-05-25 13:52:07 +000013358void
13359PyUnicode_InternInPlace(PyObject **p)
13360{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013361 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13362 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013363#ifdef Py_DEBUG
13364 assert(s != NULL);
13365 assert(_PyUnicode_CHECK(s));
13366#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013367 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013368 return;
13369#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 /* If it's a subclass, we don't really know what putting
13371 it in the interned dict might do. */
13372 if (!PyUnicode_CheckExact(s))
13373 return;
13374 if (PyUnicode_CHECK_INTERNED(s))
13375 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013376 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013377 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 return;
13379 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013380 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013381 if (interned == NULL) {
13382 interned = PyDict_New();
13383 if (interned == NULL) {
13384 PyErr_Clear(); /* Don't leave an exception */
13385 return;
13386 }
13387 }
13388 /* It might be that the GetItem call fails even
13389 though the key is present in the dictionary,
13390 namely when this happens during a stack overflow. */
13391 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013393 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013394
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 if (t) {
13396 Py_INCREF(t);
13397 Py_DECREF(*p);
13398 *p = t;
13399 return;
13400 }
Walter Dörwald16807132007-05-25 13:52:07 +000013401
Benjamin Peterson14339b62009-01-31 16:36:08 +000013402 PyThreadState_GET()->recursion_critical = 1;
13403 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13404 PyErr_Clear();
13405 PyThreadState_GET()->recursion_critical = 0;
13406 return;
13407 }
13408 PyThreadState_GET()->recursion_critical = 0;
13409 /* The two references in interned are not counted by refcnt.
13410 The deallocator will take care of this */
13411 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013412 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013413}
13414
13415void
13416PyUnicode_InternImmortal(PyObject **p)
13417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13419
Benjamin Peterson14339b62009-01-31 16:36:08 +000013420 PyUnicode_InternInPlace(p);
13421 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013423 Py_INCREF(*p);
13424 }
Walter Dörwald16807132007-05-25 13:52:07 +000013425}
13426
13427PyObject *
13428PyUnicode_InternFromString(const char *cp)
13429{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013430 PyObject *s = PyUnicode_FromString(cp);
13431 if (s == NULL)
13432 return NULL;
13433 PyUnicode_InternInPlace(&s);
13434 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013435}
13436
Alexander Belopolsky40018472011-02-26 01:02:56 +000013437void
13438_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013439{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013440 PyObject *keys;
13441 PyUnicodeObject *s;
13442 Py_ssize_t i, n;
13443 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013444
Benjamin Peterson14339b62009-01-31 16:36:08 +000013445 if (interned == NULL || !PyDict_Check(interned))
13446 return;
13447 keys = PyDict_Keys(interned);
13448 if (keys == NULL || !PyList_Check(keys)) {
13449 PyErr_Clear();
13450 return;
13451 }
Walter Dörwald16807132007-05-25 13:52:07 +000013452
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13454 detector, interned unicode strings are not forcibly deallocated;
13455 rather, we give them their stolen references back, and then clear
13456 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013457
Benjamin Peterson14339b62009-01-31 16:36:08 +000013458 n = PyList_GET_SIZE(keys);
13459 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 for (i = 0; i < n; i++) {
13462 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013463 if (PyUnicode_READY(s) == -1) {
13464 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013468 case SSTATE_NOT_INTERNED:
13469 /* XXX Shouldn't happen */
13470 break;
13471 case SSTATE_INTERNED_IMMORTAL:
13472 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013474 break;
13475 case SSTATE_INTERNED_MORTAL:
13476 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013478 break;
13479 default:
13480 Py_FatalError("Inconsistent interned string state.");
13481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013482 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013483 }
13484 fprintf(stderr, "total size of all interned strings: "
13485 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13486 "mortal/immortal\n", mortal_size, immortal_size);
13487 Py_DECREF(keys);
13488 PyDict_Clear(interned);
13489 Py_DECREF(interned);
13490 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013491}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013492
13493
13494/********************* Unicode Iterator **************************/
13495
13496typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013497 PyObject_HEAD
13498 Py_ssize_t it_index;
13499 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013500} unicodeiterobject;
13501
13502static void
13503unicodeiter_dealloc(unicodeiterobject *it)
13504{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013505 _PyObject_GC_UNTRACK(it);
13506 Py_XDECREF(it->it_seq);
13507 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013508}
13509
13510static int
13511unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13512{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013513 Py_VISIT(it->it_seq);
13514 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013515}
13516
13517static PyObject *
13518unicodeiter_next(unicodeiterobject *it)
13519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013520 PyUnicodeObject *seq;
13521 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013522
Benjamin Peterson14339b62009-01-31 16:36:08 +000013523 assert(it != NULL);
13524 seq = it->it_seq;
13525 if (seq == NULL)
13526 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013527 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013529 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13530 int kind = PyUnicode_KIND(seq);
13531 void *data = PyUnicode_DATA(seq);
13532 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13533 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 if (item != NULL)
13535 ++it->it_index;
13536 return item;
13537 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013538
Benjamin Peterson14339b62009-01-31 16:36:08 +000013539 Py_DECREF(seq);
13540 it->it_seq = NULL;
13541 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013542}
13543
13544static PyObject *
13545unicodeiter_len(unicodeiterobject *it)
13546{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013547 Py_ssize_t len = 0;
13548 if (it->it_seq)
13549 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13550 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013551}
13552
13553PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13554
13555static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013556 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013558 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013559};
13560
13561PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013562 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13563 "str_iterator", /* tp_name */
13564 sizeof(unicodeiterobject), /* tp_basicsize */
13565 0, /* tp_itemsize */
13566 /* methods */
13567 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13568 0, /* tp_print */
13569 0, /* tp_getattr */
13570 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013571 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013572 0, /* tp_repr */
13573 0, /* tp_as_number */
13574 0, /* tp_as_sequence */
13575 0, /* tp_as_mapping */
13576 0, /* tp_hash */
13577 0, /* tp_call */
13578 0, /* tp_str */
13579 PyObject_GenericGetAttr, /* tp_getattro */
13580 0, /* tp_setattro */
13581 0, /* tp_as_buffer */
13582 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13583 0, /* tp_doc */
13584 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13585 0, /* tp_clear */
13586 0, /* tp_richcompare */
13587 0, /* tp_weaklistoffset */
13588 PyObject_SelfIter, /* tp_iter */
13589 (iternextfunc)unicodeiter_next, /* tp_iternext */
13590 unicodeiter_methods, /* tp_methods */
13591 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013592};
13593
13594static PyObject *
13595unicode_iter(PyObject *seq)
13596{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013597 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013598
Benjamin Peterson14339b62009-01-31 16:36:08 +000013599 if (!PyUnicode_Check(seq)) {
13600 PyErr_BadInternalCall();
13601 return NULL;
13602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 if (PyUnicode_READY(seq) == -1)
13604 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013605 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13606 if (it == NULL)
13607 return NULL;
13608 it->it_index = 0;
13609 Py_INCREF(seq);
13610 it->it_seq = (PyUnicodeObject *)seq;
13611 _PyObject_GC_TRACK(it);
13612 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013613}
13614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615#define UNIOP(x) Py_UNICODE_##x
13616#define UNIOP_t Py_UNICODE
13617#include "uniops.h"
13618#undef UNIOP
13619#undef UNIOP_t
13620#define UNIOP(x) Py_UCS4_##x
13621#define UNIOP_t Py_UCS4
13622#include "uniops.h"
13623#undef UNIOP
13624#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013625
Victor Stinner71133ff2010-09-01 23:43:53 +000013626Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013627PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013628{
13629 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13630 Py_UNICODE *copy;
13631 Py_ssize_t size;
13632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013633 if (!PyUnicode_Check(unicode)) {
13634 PyErr_BadArgument();
13635 return NULL;
13636 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013637 /* Ensure we won't overflow the size. */
13638 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13639 PyErr_NoMemory();
13640 return NULL;
13641 }
13642 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13643 size *= sizeof(Py_UNICODE);
13644 copy = PyMem_Malloc(size);
13645 if (copy == NULL) {
13646 PyErr_NoMemory();
13647 return NULL;
13648 }
13649 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13650 return copy;
13651}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013652
Georg Brandl66c221e2010-10-14 07:04:07 +000013653/* A _string module, to export formatter_parser and formatter_field_name_split
13654 to the string.Formatter class implemented in Python. */
13655
13656static PyMethodDef _string_methods[] = {
13657 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13658 METH_O, PyDoc_STR("split the argument as a field name")},
13659 {"formatter_parser", (PyCFunction) formatter_parser,
13660 METH_O, PyDoc_STR("parse the argument as a format string")},
13661 {NULL, NULL}
13662};
13663
13664static struct PyModuleDef _string_module = {
13665 PyModuleDef_HEAD_INIT,
13666 "_string",
13667 PyDoc_STR("string helper module"),
13668 0,
13669 _string_methods,
13670 NULL,
13671 NULL,
13672 NULL,
13673 NULL
13674};
13675
13676PyMODINIT_FUNC
13677PyInit__string(void)
13678{
13679 return PyModule_Create(&_string_module);
13680}
13681
13682
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013683#ifdef __cplusplus
13684}
13685#endif