blob: a64f795fe38c4c6ce44ab7ea2e3de258ca89e2da [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Sanity check used by the accessor macros below: the structural
   consistency checker in debug builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* UTF-8 buffer.  The underscore variant is the raw struct field; the
   checked variant asserts the object is ready and, for compact ASCII
   strings, yields the bytes stored right after the PyASCIIObject
   header instead of the field. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op)))

/* UTF-8 length.  Same split as above: for compact ASCII strings the
   checked variant returns the character length of the string. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op)))

/* Unchecked struct field accessors. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Kind and length readers that first run _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Character data pointer stored in the PyUnicodeObject itself. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY(): evaluates to 0 when the string
   is already in ready state, otherwise to the result of
   _PyUnicode_Ready().  The object is sanity-checked first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same idea, but takes a pointer to the object pointer and defers to
   _PyUnicode_ReadyReplace() when the string is not ready yet.
   The argument is parenthesized before being dereferenced so that a
   compound expression (e.g. "array + i") expands correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the UTF-8 pointer of the object aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object aliases its character data.
   Both sides of the comparison use the macro argument; the previous
   expansion read a variable literally named "unicode" from the calling
   scope and so only worked when the caller happened to use that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* Non-zero when the object has a UTF-8 buffer of its own, i.e. the
   string is not compact ASCII, the utf8 pointer is set, and it does
   not alias the character data.  The compact-ASCII test comes first
   so the utf8 field is never read for such objects. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op) != NULL                        \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object has a wstr buffer of its own, i.e. the
   wstr pointer is set and either the string is not ready or wstr does
   not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) != NULL                           \
      && (!PyUnicode_IS_READY(op)                           \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Element-wise copy between character buffers of different widths.
   from_type and to_type must be valid type names.  begin and end
   delimit the source range [begin, end) and should be of type
   "from_type *"; to points at the destination buffer, which receives
   one "to_type" value per source element, each converted with a plain
   cast.  Note that "end" is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* Mark the string as modified by resetting its cached hash to -1. */
#define _PyUnicode_DIRTY(op)            \
    do {                                \
        _PyUnicode_HASH(op) = -1;       \
    } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: one flag
   per ASCII code point (0x00 - 0x7F), 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200242static void copy_characters(
243 PyObject *to, Py_ssize_t to_start,
244 PyObject *from, Py_ssize_t from_start,
245 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200246#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200247static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200248#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static PyObject *
251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
253 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
259 const Py_UNICODE *unicode, Py_ssize_t size,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same for linebreaks: one flag per ASCII code point (0x00 - 0x7F),
   1 meaning "line break".  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a unicode object.

   Every invariant is expressed as an assert(); the function itself
   always returns 1, so a call can be wrapped in assert().

   op            - the object to inspect (must pass PyUnicode_Check)
   check_content - when non-zero, additionally scan the characters to
                   verify that the narrowest suitable kind is used, and
                   require the cached hash to be -1 unless the object
                   is one of the shared singletons. */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    /* the kind whose storage unit has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *buffer;

        if (hdr->state.compact != 1) {
            /* non-compact object: data lives behind a pointer */
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            buffer = legacy->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(buffer != NULL);
                if (hdr->state.ascii) {
                    assert(ext->utf8 == buffer);
                    assert(ext->utf8_length == hdr->length);
                }
                else {
                    assert(ext->utf8 != buffer);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->wstr != NULL);
                assert(buffer == NULL);
                assert(ext->utf8 == NULL);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
            }
        }
        else {
            /* compact non-ASCII string: data follows the struct */
            buffer = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(ext->utf8 != buffer);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(hdr->wstr == buffer);
                assert(ext->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != buffer);
            }
        }

        /* an absent cache must carry a zero length */
        if (ext->utf8 == NULL) {
            assert(ext->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(ext->wstr_length == 0);
        }
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            void *chars = PyUnicode_DATA(hdr);
            Py_UCS4 highest = 0;
            Py_ssize_t pos;

            for (pos = 0; pos < hdr->length; pos++) {
                Py_UCS4 cur = PyUnicode_READ(kind, chars, pos);
                if (cur > highest) {
                    highest = cur;
                }
            }
            if (kind == PyUnicode_1BYTE_KIND) {
                if (hdr->state.ascii == 0) {
                    assert(highest >= 128);
                }
                else {
                    assert(highest < 128);
                }
            }
            else if (kind == PyUnicode_2BYTE_KIND) {
                assert(highest >= 0x100);
            }
            else {
                assert(highest >= 0x10000);
            }
        }
        if (!unicode_is_singleton((PyObject*)hdr)) {
            assert(hdr->hash == -1);
        }
    }
    return 1;
}
#endif
410
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411/* --- Bloom Filters ----------------------------------------------------- */
412
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each unicode character (5, 6 or 7 of them, depending on
   BLOOM_WIDTH) as the bit index. */
416
417/* the linebreak mask is set up by Unicode_Init below */
418
Antoine Pitrouf068f942010-01-13 14:19:12 +0000419#if LONG_BIT >= 128
420#define BLOOM_WIDTH 128
421#elif LONG_BIT >= 64
422#define BLOOM_WIDTH 64
423#elif LONG_BIT >= 32
424#define BLOOM_WIDTH 32
425#else
426#error "LONG_BIT is smaller than 32"
427#endif
428
Thomas Wouters477c8d52006-05-27 19:21:47 +0000429#define BLOOM_MASK unsigned long
430
431static BLOOM_MASK bloom_linebreak;
432
Antoine Pitrouf068f942010-01-13 14:19:12 +0000433#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
434#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000435
Benjamin Peterson29060642009-01-31 22:14:21 +0000436#define BLOOM_LINEBREAK(ch) \
437 ((ch) < 128U ? ascii_linebreak[(ch)] : \
438 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439
Alexander Belopolsky40018472011-02-26 01:02:56 +0000440Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442{
443 /* calculate simple bloom-style bitmask for a given unicode string */
444
Antoine Pitrouf068f942010-01-13 14:19:12 +0000445 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446 Py_ssize_t i;
447
448 mask = 0;
449 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000451
452 return mask;
453}
454
/* True if chr occurs in str.  The bloom mask is consulted first so the
   actual search (PyUnicode_FindChar over the whole string) only runs
   when the mask says the character may be present. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr,                                   \
                            0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459/* --- Unicode Object ----------------------------------------------------- */
460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200462fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
465 Py_ssize_t size, Py_UCS4 ch,
466 int direction)
467{
468 /* like wcschr, but doesn't stop at NULL characters */
469 Py_ssize_t i;
470 if (direction == 1) {
471 for(i = 0; i < size; i++)
472 if (PyUnicode_READ(kind, s, i) == ch)
473 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
474 }
475 else {
476 for(i = size-1; i >= 0; i--)
477 if (PyUnicode_READ(kind, s, i) == ch)
478 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
479 }
480 return NULL;
481}
482
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483static PyObject*
484resize_compact(PyObject *unicode, Py_ssize_t length)
485{
486 Py_ssize_t char_size;
487 Py_ssize_t struct_size;
488 Py_ssize_t new_size;
489 int share_wstr;
490
491 assert(PyUnicode_IS_READY(unicode));
492 char_size = PyUnicode_CHARACTER_SIZE(unicode);
493 if (PyUnicode_IS_COMPACT_ASCII(unicode))
494 struct_size = sizeof(PyASCIIObject);
495 else
496 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200497 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498
499 _Py_DEC_REFTOTAL;
500 _Py_ForgetReference(unicode);
501
502 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
503 PyErr_NoMemory();
504 return NULL;
505 }
506 new_size = (struct_size + (length + 1) * char_size);
507
508 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
509 if (unicode == NULL) {
510 PyObject_Del(unicode);
511 PyErr_NoMemory();
512 return NULL;
513 }
514 _Py_NewReference(unicode);
515 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200517 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
522 length, 0);
523 return unicode;
524}
525
Alexander Belopolsky40018472011-02-26 01:02:56 +0000526static int
Victor Stinner95663112011-10-04 01:03:50 +0200527resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528{
Victor Stinner95663112011-10-04 01:03:50 +0200529 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200531 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000532
Victor Stinner95663112011-10-04 01:03:50 +0200533 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534
535 if (PyUnicode_IS_READY(unicode)) {
536 Py_ssize_t char_size;
537 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200538 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200539 void *data;
540
541 data = _PyUnicode_DATA_ANY(unicode);
542 assert(data != NULL);
543 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200544 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
545 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200546 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
547 {
548 PyObject_DEL(_PyUnicode_UTF8(unicode));
549 _PyUnicode_UTF8(unicode) = NULL;
550 _PyUnicode_UTF8_LENGTH(unicode) = 0;
551 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552
553 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
554 PyErr_NoMemory();
555 return -1;
556 }
557 new_size = (length + 1) * char_size;
558
559 data = (PyObject *)PyObject_REALLOC(data, new_size);
560 if (data == NULL) {
561 PyErr_NoMemory();
562 return -1;
563 }
564 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200565 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200566 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200567 _PyUnicode_WSTR_LENGTH(unicode) = length;
568 }
569 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200570 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200571 _PyUnicode_UTF8_LENGTH(unicode) = length;
572 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 _PyUnicode_LENGTH(unicode) = length;
574 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200575 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579 }
Victor Stinner95663112011-10-04 01:03:50 +0200580 assert(_PyUnicode_WSTR(unicode) != NULL);
581
582 /* check for integer overflow */
583 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
584 PyErr_NoMemory();
585 return -1;
586 }
587 wstr = _PyUnicode_WSTR(unicode);
588 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
589 if (!wstr) {
590 PyErr_NoMemory();
591 return -1;
592 }
593 _PyUnicode_WSTR(unicode) = wstr;
594 _PyUnicode_WSTR(unicode)[length] = 0;
595 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 return 0;
598}
599
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600static PyObject*
601resize_copy(PyObject *unicode, Py_ssize_t length)
602{
603 Py_ssize_t copy_length;
604 if (PyUnicode_IS_COMPACT(unicode)) {
605 PyObject *copy;
606 assert(PyUnicode_IS_READY(unicode));
607
608 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
609 if (copy == NULL)
610 return NULL;
611
612 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200613 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200615 }
616 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 assert(_PyUnicode_WSTR(unicode) != NULL);
619 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 if (w == NULL)
622 return NULL;
623 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
624 copy_length = Py_MIN(copy_length, length);
625 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
626 copy_length);
627 return (PyObject*)w;
628 }
629}
630
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000636 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638*/
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640#ifdef Py_DEBUG
641int unicode_old_new_calls = 0;
642#endif
643
Alexander Belopolsky40018472011-02-26 01:02:56 +0000644static PyUnicodeObject *
645_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646{
647 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 if (length == 0 && unicode_empty != NULL) {
652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200653 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000656 /* Ensure we won't overflow the size. */
657 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
658 return (PyUnicodeObject *)PyErr_NoMemory();
659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660 if (length < 0) {
661 PyErr_SetString(PyExc_SystemError,
662 "Negative size passed to _PyUnicode_New");
663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666#ifdef Py_DEBUG
667 ++unicode_old_new_calls;
668#endif
669
670 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
671 if (unicode == NULL)
672 return NULL;
673 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
674 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
675 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000676 PyErr_NoMemory();
677 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679
Jeremy Hyltond8082792003-09-16 19:41:39 +0000680 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000681 * the caller fails before initializing str -- unicode_resize()
682 * reads str[0], and the Keep-Alive optimization can keep memory
683 * allocated for str alive across a call to unicode_dealloc(unicode).
684 * We don't want unicode_resize to read uninitialized memory in
685 * that case.
686 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 _PyUnicode_WSTR(unicode)[0] = 0;
688 _PyUnicode_WSTR(unicode)[length] = 0;
689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 _PyUnicode_HASH(unicode) = -1;
691 _PyUnicode_STATE(unicode).interned = 0;
692 _PyUnicode_STATE(unicode).kind = 0;
693 _PyUnicode_STATE(unicode).compact = 0;
694 _PyUnicode_STATE(unicode).ready = 0;
695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200696 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200698 _PyUnicode_UTF8(unicode) = NULL;
699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000701
Benjamin Peterson29060642009-01-31 22:14:21 +0000702 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000703 /* XXX UNREF/NEWREF interface should be more symmetrical */
704 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000706 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708}
709
Victor Stinnerf42dc442011-10-02 23:33:16 +0200710static const char*
711unicode_kind_name(PyObject *unicode)
712{
Victor Stinner42dfd712011-10-03 14:41:45 +0200713 /* don't check consistency: unicode_kind_name() is called from
714 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (!PyUnicode_IS_COMPACT(unicode))
716 {
717 if (!PyUnicode_IS_READY(unicode))
718 return "wstr";
719 switch(PyUnicode_KIND(unicode))
720 {
721 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 return "legacy ascii";
724 else
725 return "legacy latin1";
726 case PyUnicode_2BYTE_KIND:
727 return "legacy UCS2";
728 case PyUnicode_4BYTE_KIND:
729 return "legacy UCS4";
730 default:
731 return "<legacy invalid kind>";
732 }
733 }
734 assert(PyUnicode_IS_READY(unicode));
735 switch(PyUnicode_KIND(unicode))
736 {
737 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 return "ascii";
740 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200745 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200746 default:
747 return "<invalid compact kind>";
748 }
749}
750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() on every call that
   gets past its empty-string shortcut. */
int unicode_new_new_calls = 0;
753
/* The following functions wrap macros so they can be called from a
   debugger. */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
758
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
762void *_PyUnicode_data(void *unicode){
763 printf("obj %p\n", unicode);
764 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
765 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
766 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
767 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
768 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
769 return PyUnicode_DATA(unicode);
770}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200771
772void
773_PyUnicode_Dump(PyObject *op)
774{
775 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200776 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
777 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
778 void *data;
779 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
780 if (ascii->state.compact)
781 data = (compact + 1);
782 else
783 data = unicode->data.any;
784 if (ascii->wstr == data)
785 printf("shared ");
786 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200787 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200788 printf(" (%zu), ", compact->wstr_length);
789 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
790 printf("shared ");
791 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200792 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200793 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200794}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
797PyObject *
798PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
799{
800 PyObject *obj;
801 PyCompactUnicodeObject *unicode;
802 void *data;
803 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200804 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 Py_ssize_t char_size;
806 Py_ssize_t struct_size;
807
808 /* Optimization for empty strings */
809 if (size == 0 && unicode_empty != NULL) {
810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200811 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812 }
813
814#ifdef Py_DEBUG
815 ++unicode_new_new_calls;
816#endif
817
Victor Stinner9e9d6892011-10-04 01:02:02 +0200818 is_ascii = 0;
819 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 struct_size = sizeof(PyCompactUnicodeObject);
821 if (maxchar < 128) {
822 kind_state = PyUnicode_1BYTE_KIND;
823 char_size = 1;
824 is_ascii = 1;
825 struct_size = sizeof(PyASCIIObject);
826 }
827 else if (maxchar < 256) {
828 kind_state = PyUnicode_1BYTE_KIND;
829 char_size = 1;
830 }
831 else if (maxchar < 65536) {
832 kind_state = PyUnicode_2BYTE_KIND;
833 char_size = 2;
834 if (sizeof(wchar_t) == 2)
835 is_sharing = 1;
836 }
837 else {
838 kind_state = PyUnicode_4BYTE_KIND;
839 char_size = 4;
840 if (sizeof(wchar_t) == 4)
841 is_sharing = 1;
842 }
843
844 /* Ensure we won't overflow the size. */
845 if (size < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to PyUnicode_New");
848 return NULL;
849 }
850 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
851 return PyErr_NoMemory();
852
853 /* Duplicated allocation code from _PyObject_New() instead of a call to
854 * PyObject_New() so we are able to allocate space for the object and
855 * it's data buffer.
856 */
857 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
858 if (obj == NULL)
859 return PyErr_NoMemory();
860 obj = PyObject_INIT(obj, &PyUnicode_Type);
861 if (obj == NULL)
862 return NULL;
863
864 unicode = (PyCompactUnicodeObject *)obj;
865 if (is_ascii)
866 data = ((PyASCIIObject*)obj) + 1;
867 else
868 data = unicode + 1;
869 _PyUnicode_LENGTH(unicode) = size;
870 _PyUnicode_HASH(unicode) = -1;
871 _PyUnicode_STATE(unicode).interned = 0;
872 _PyUnicode_STATE(unicode).kind = kind_state;
873 _PyUnicode_STATE(unicode).compact = 1;
874 _PyUnicode_STATE(unicode).ready = 1;
875 _PyUnicode_STATE(unicode).ascii = is_ascii;
876 if (is_ascii) {
877 ((char*)data)[size] = 0;
878 _PyUnicode_WSTR(unicode) = NULL;
879 }
880 else if (kind_state == PyUnicode_1BYTE_KIND) {
881 ((char*)data)[size] = 0;
882 _PyUnicode_WSTR(unicode) = NULL;
883 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200885 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 }
887 else {
888 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 if (kind_state == PyUnicode_2BYTE_KIND)
891 ((Py_UCS2*)data)[size] = 0;
892 else /* kind_state == PyUnicode_4BYTE_KIND */
893 ((Py_UCS4*)data)[size] = 0;
894 if (is_sharing) {
895 _PyUnicode_WSTR_LENGTH(unicode) = size;
896 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
897 }
898 else {
899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
900 _PyUnicode_WSTR(unicode) = NULL;
901 }
902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200903 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 return obj;
905}
906
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the 4-byte data
   buffer of `unicode`, combining each high/low surrogate pair into one
   code point and copying every other unit (lone surrogates included)
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   The destination must already be a 4-byte-kind string whose length
   equals the number of decoded code points (checked by assertions). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* A pair needs a high surrogate followed, inside the input, by a
           low surrogate. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
945
Victor Stinnercd9950f2011-10-02 00:34:53 +0200946static int
947_PyUnicode_Dirty(PyObject *unicode)
948{
Victor Stinner910337b2011-10-03 03:20:16 +0200949 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200951 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200952 "Cannot modify a string having more than 1 reference");
953 return -1;
954 }
955 _PyUnicode_DIRTY(unicode);
956 return 0;
957}
958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200959static int
960_copy_characters(PyObject *to, Py_ssize_t to_start,
961 PyObject *from, Py_ssize_t from_start,
962 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 unsigned int from_kind, to_kind;
965 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200968 assert(PyUnicode_Check(from));
969 assert(PyUnicode_Check(to));
970 assert(PyUnicode_IS_READY(from));
971 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200973 assert(PyUnicode_GET_LENGTH(from) >= how_many);
974 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
975 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200977 if (how_many == 0)
978 return 0;
979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200983 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200985#ifdef Py_DEBUG
986 if (!check_maxchar
987 && (from_kind > to_kind
988 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200990 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
991 Py_UCS4 ch;
992 Py_ssize_t i;
993 for (i=0; i < how_many; i++) {
994 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
995 assert(ch <= to_maxchar);
996 }
997 }
998#endif
999 fast = (from_kind == to_kind);
1000 if (check_maxchar
1001 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1002 {
1003 /* deny latin1 => ascii */
1004 fast = 0;
1005 }
1006
1007 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001011 + PyUnicode_KIND_SIZE(from_kind, from_start),
1012 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else if (from_kind == PyUnicode_1BYTE_KIND
1015 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001016 {
1017 _PyUnicode_CONVERT_BYTES(
1018 Py_UCS1, Py_UCS2,
1019 PyUnicode_1BYTE_DATA(from) + from_start,
1020 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1021 PyUnicode_2BYTE_DATA(to) + to_start
1022 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001024 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001025 && to_kind == PyUnicode_4BYTE_KIND)
1026 {
1027 _PyUnicode_CONVERT_BYTES(
1028 Py_UCS1, Py_UCS4,
1029 PyUnicode_1BYTE_DATA(from) + from_start,
1030 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1031 PyUnicode_4BYTE_DATA(to) + to_start
1032 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001033 }
1034 else if (from_kind == PyUnicode_2BYTE_KIND
1035 && to_kind == PyUnicode_4BYTE_KIND)
1036 {
1037 _PyUnicode_CONVERT_BYTES(
1038 Py_UCS2, Py_UCS4,
1039 PyUnicode_2BYTE_DATA(from) + from_start,
1040 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1041 PyUnicode_4BYTE_DATA(to) + to_start
1042 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001043 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 /* check if max_char(from substring) <= max_char(to) */
1046 if (from_kind > to_kind
1047 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001048 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001049 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 /* slow path to check for character overflow */
1051 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 Py_ssize_t i;
1054
Victor Stinner56c161a2011-10-06 02:47:11 +02001055#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001056 for (i=0; i < how_many; i++) {
1057 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001058 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1060 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001061#else
1062 if (!check_maxchar) {
1063 for (i=0; i < how_many; i++) {
1064 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1065 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1066 }
1067 }
1068 else {
1069 for (i=0; i < how_many; i++) {
1070 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1071 if (ch > to_maxchar)
1072 return 1;
1073 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1074 }
1075 }
1076#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001077 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001078 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001079 assert(0 && "inconsistent state");
1080 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001081 }
1082 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001083 return 0;
1084}
1085
1086static void
1087copy_characters(PyObject *to, Py_ssize_t to_start,
1088 PyObject *from, Py_ssize_t from_start,
1089 Py_ssize_t how_many)
1090{
1091 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1092}
1093
1094Py_ssize_t
1095PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1096 PyObject *from, Py_ssize_t from_start,
1097 Py_ssize_t how_many)
1098{
1099 int err;
1100
1101 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1102 PyErr_BadInternalCall();
1103 return -1;
1104 }
1105
1106 if (PyUnicode_READY(from))
1107 return -1;
1108 if (PyUnicode_READY(to))
1109 return -1;
1110
1111 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1112 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1113 PyErr_Format(PyExc_SystemError,
1114 "Cannot write %zi characters at %zi "
1115 "in a string of %zi characters",
1116 how_many, to_start, PyUnicode_GET_LENGTH(to));
1117 return -1;
1118 }
1119
1120 if (how_many == 0)
1121 return 0;
1122
1123 if (_PyUnicode_Dirty(to))
1124 return -1;
1125
1126 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1127 if (err) {
1128 PyErr_Format(PyExc_SystemError,
1129 "Cannot copy %s characters "
1130 "into a string of %s characters",
1131 unicode_kind_name(from),
1132 unicode_kind_name(to));
1133 return -1;
1134 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136}
1137
Victor Stinner17222162011-09-28 22:15:37 +02001138/* Find the maximum code point and count the number of surrogate pairs so a
1139 correct string length can be computed before converting a string to UCS4.
1140 This function counts single surrogates as a character and not as a pair.
1141
1142 Return 0 on success, or -1 on error. */
1143static int
1144find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1145 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
1147 const wchar_t *iter;
1148
Victor Stinnerc53be962011-10-02 21:33:54 +02001149 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 *num_surrogates = 0;
1151 *maxchar = 0;
1152
1153 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001154 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001156#if SIZEOF_WCHAR_T != 2
1157 if (*maxchar >= 0x10000)
1158 return 0;
1159#endif
1160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161#if SIZEOF_WCHAR_T == 2
1162 if (*iter >= 0xD800 && *iter <= 0xDBFF
1163 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1164 {
1165 Py_UCS4 surrogate_val;
1166 surrogate_val = (((iter[0] & 0x3FF)<<10)
1167 | (iter[1] & 0x3FF)) + 0x10000;
1168 ++(*num_surrogates);
1169 if (surrogate_val > *maxchar)
1170 *maxchar = surrogate_val;
1171 iter += 2;
1172 }
1173 else
1174 iter++;
1175#else
1176 iter++;
1177#endif
1178 }
1179 return 0;
1180}
1181
#if defined(Py_DEBUG)
/* Debug-build counter, incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1185
/* Convert the wchar_t-only string *p_obj into a "ready" string by building
   its canonical 1-, 2- or 4-byte representation, chosen from the largest
   code point found in the wstr buffer.

   With `replace` non-zero, an empty string or a single character below
   256 is swapped for another object: *p_obj is released and overwritten
   with unicode_empty or the result of get_latin1_char().  In release
   builds `replace` is silently dropped when the object is shared; in
   Py_DEBUG builds that situation is an assertion failure.

   Returns 0 on success, -1 on failure (allocation failures set
   MemoryError). */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only legal while nobody else can observe
       the old pointer, i.e. with a reference count of exactly 1. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single character below 256: hand out the object returned by
           get_latin1_char() instead. */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    /* Determine the widest character (and, for 16-bit wchar_t, the number
       of surrogate pairs) to pick the target representation. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte representation: always a new buffer; the wstr buffer is
           freed once the characters have been narrowed. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer is also used as the UTF-8
               representation (same pointer, same length). */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a fresh 2-byte buffer, then drop the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into one code point, so the result
           is shorter than the wstr buffer by the number of pairs. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide here: the wstr buffer is used
           directly as the canonical data. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1344
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001345int
1346_PyUnicode_ReadyReplace(PyObject **op)
1347{
1348 return unicode_ready(op, 1);
1349}
1350
1351int
1352_PyUnicode_Ready(PyObject *op)
1353{
1354 return unicode_ready(&op, 0);
1355}
1356
Alexander Belopolsky40018472011-02-26 01:02:56 +00001357static void
1358unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
Walter Dörwald16807132007-05-25 13:52:07 +00001360 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 case SSTATE_NOT_INTERNED:
1362 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001363
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 case SSTATE_INTERNED_MORTAL:
1365 /* revive dead object temporarily for DelItem */
1366 Py_REFCNT(unicode) = 3;
1367 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1368 Py_FatalError(
1369 "deletion of interned string failed");
1370 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 case SSTATE_INTERNED_IMMORTAL:
1373 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 default:
1376 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001377 }
1378
Victor Stinner03490912011-10-03 23:45:12 +02001379 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001381 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383
1384 if (PyUnicode_IS_COMPACT(unicode)) {
1385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 if (_PyUnicode_DATA_ANY(unicode))
1389 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 }
1392}
1393
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty-string object or the object stored
   in unicode_latin1[] for its single character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string with a canonical representation and exactly one
       character can be a cached latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1410
Alexander Belopolsky40018472011-02-26 01:02:56 +00001411static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001412unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001413{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001414 if (Py_REFCNT(unicode) != 1)
1415 return 0;
1416 if (PyUnicode_CHECK_INTERNED(unicode))
1417 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001418#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001419 /* singleton refcount is greater than 1 */
1420 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001421#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001422 return 1;
1423}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001424
Victor Stinnerfe226c02011-10-03 03:52:20 +02001425static int
1426unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1427{
1428 PyObject *unicode;
1429 Py_ssize_t old_length;
1430
1431 assert(p_unicode != NULL);
1432 unicode = *p_unicode;
1433
1434 assert(unicode != NULL);
1435 assert(PyUnicode_Check(unicode));
1436 assert(0 <= length);
1437
Victor Stinner910337b2011-10-03 03:20:16 +02001438 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001439 old_length = PyUnicode_WSTR_LENGTH(unicode);
1440 else
1441 old_length = PyUnicode_GET_LENGTH(unicode);
1442 if (old_length == length)
1443 return 0;
1444
Victor Stinnerfe226c02011-10-03 03:52:20 +02001445 if (!unicode_resizable(unicode)) {
1446 PyObject *copy = resize_copy(unicode, length);
1447 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001449 Py_DECREF(*p_unicode);
1450 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001452 }
1453
Victor Stinnerfe226c02011-10-03 03:52:20 +02001454 if (PyUnicode_IS_COMPACT(unicode)) {
1455 *p_unicode = resize_compact(unicode, length);
1456 if (*p_unicode == NULL)
1457 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001458 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001459 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001460 }
1461 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001465PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001466{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001467 PyObject *unicode;
1468 if (p_unicode == NULL) {
1469 PyErr_BadInternalCall();
1470 return -1;
1471 }
1472 unicode = *p_unicode;
1473 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1474 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1475 {
1476 PyErr_BadInternalCall();
1477 return -1;
1478 }
1479 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001480}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482static PyObject*
1483get_latin1_char(unsigned char ch)
1484{
Victor Stinnera464fc12011-10-02 20:39:30 +02001485 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (!unicode)
1489 return NULL;
1490 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001491 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 unicode_latin1[ch] = unicode;
1493 }
1494 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001495 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496}
1497
Alexander Belopolsky40018472011-02-26 01:02:56 +00001498PyObject *
1499PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500{
1501 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 Py_UCS4 maxchar = 0;
1503 Py_ssize_t num_surrogates;
1504
1505 if (u == NULL)
1506 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001508 /* If the Unicode data is known at construction time, we can apply
1509 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 /* Optimization for empty strings */
1512 if (size == 0 && unicode_empty != NULL) {
1513 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001514 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001515 }
Tim Petersced69f82003-09-16 20:30:58 +00001516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 /* Single character Unicode objects in the Latin-1 range are
1518 shared when using this constructor */
1519 if (size == 1 && *u < 256)
1520 return get_latin1_char((unsigned char)*u);
1521
1522 /* If not empty and not single character, copy the Unicode data
1523 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001524 if (find_maxchar_surrogates(u, u + size,
1525 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 return NULL;
1527
1528 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1529 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (!unicode)
1531 return NULL;
1532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 switch (PyUnicode_KIND(unicode)) {
1534 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001535 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1537 break;
1538 case PyUnicode_2BYTE_KIND:
1539#if Py_UNICODE_SIZE == 2
1540 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1541#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001542 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1544#endif
1545 break;
1546 case PyUnicode_4BYTE_KIND:
1547#if SIZEOF_WCHAR_T == 2
1548 /* This is the only case which has to process surrogates, thus
1549 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001550 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551#else
1552 assert(num_surrogates == 0);
1553 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1554#endif
1555 break;
1556 default:
1557 assert(0 && "Impossible state");
1558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001560 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return (PyObject *)unicode;
1562}
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564PyObject *
1565PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001566{
1567 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001568
Benjamin Peterson14339b62009-01-31 16:36:08 +00001569 if (size < 0) {
1570 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001572 return NULL;
1573 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001574
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001575 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001576 some optimizations which share commonly used objects.
1577 Also, this means the input must be UTF-8, so fall back to the
1578 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001579 if (u != NULL) {
1580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 /* Optimization for empty strings */
1582 if (size == 0 && unicode_empty != NULL) {
1583 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001584 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001586
1587 /* Single characters are shared when using this constructor.
1588 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if (size == 1 && Py_CHARMASK(*u) < 128)
1590 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001591
1592 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001593 }
1594
Walter Dörwald55507312007-05-18 13:12:10 +00001595 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001596 if (!unicode)
1597 return NULL;
1598
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001599 return (PyObject *)unicode;
1600}
1601
Alexander Belopolsky40018472011-02-26 01:02:56 +00001602PyObject *
1603PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001604{
1605 size_t size = strlen(u);
1606 if (size > PY_SSIZE_T_MAX) {
1607 PyErr_SetString(PyExc_OverflowError, "input too long");
1608 return NULL;
1609 }
1610
1611 return PyUnicode_FromStringAndSize(u, size);
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001615unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001616{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001617 PyObject *res;
1618#ifdef Py_DEBUG
1619 const unsigned char *p;
1620 const unsigned char *end = s + size;
1621 for (p=s; p < end; p++) {
1622 assert(*p < 128);
1623 }
1624#endif
1625 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001626 if (!res)
1627 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001628 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001629 return res;
1630}
1631
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001632static Py_UCS4
1633kind_maxchar_limit(unsigned int kind)
1634{
1635 switch(kind) {
1636 case PyUnicode_1BYTE_KIND:
1637 return 0x80;
1638 case PyUnicode_2BYTE_KIND:
1639 return 0x100;
1640 case PyUnicode_4BYTE_KIND:
1641 return 0x10000;
1642 default:
1643 assert(0 && "invalid kind");
1644 return 0x10ffff;
1645 }
1646}
1647
Victor Stinner702c7342011-10-05 13:50:52 +02001648static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001652 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654
1655 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 for (i = 0; i < size; i++) {
1657 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001658 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001660 }
1661 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 if (!res)
1664 return NULL;
1665 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001666 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001668}
1669
Victor Stinnere57b1c02011-09-28 22:20:48 +02001670static PyObject*
1671_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672{
1673 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001674 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001676
1677 assert(size >= 0);
1678 for (i = 0; i < size; i++) {
1679 if (u[i] > max_char) {
1680 max_char = u[i];
1681 if (max_char >= 256)
1682 break;
1683 }
1684 }
1685 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 if (!res)
1687 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001688 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1690 else
1691 for (i = 0; i < size; i++)
1692 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001693 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 return res;
1695}
1696
Victor Stinnere57b1c02011-09-28 22:20:48 +02001697static PyObject*
1698_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699{
1700 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001701 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001703
1704 assert(size >= 0);
1705 for (i = 0; i < size; i++) {
1706 if (u[i] > max_char) {
1707 max_char = u[i];
1708 if (max_char >= 0x10000)
1709 break;
1710 }
1711 }
1712 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!res)
1714 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001715 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1717 else {
1718 int kind = PyUnicode_KIND(res);
1719 void *data = PyUnicode_DATA(res);
1720 for (i = 0; i < size; i++)
1721 PyUnicode_WRITE(kind, data, i, u[i]);
1722 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001723 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 return res;
1725}
1726
1727PyObject*
1728PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1729{
1730 switch(kind) {
1731 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001732 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001734 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001736 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737 default:
1738 assert(0 && "invalid kind");
1739 PyErr_SetString(PyExc_SystemError, "invalid kind");
1740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742}
1743
Victor Stinner034f6cf2011-09-30 02:26:44 +02001744PyObject*
1745PyUnicode_Copy(PyObject *unicode)
1746{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001747 Py_ssize_t size;
1748 PyObject *copy;
1749 void *data;
1750
Victor Stinner034f6cf2011-09-30 02:26:44 +02001751 if (!PyUnicode_Check(unicode)) {
1752 PyErr_BadInternalCall();
1753 return NULL;
1754 }
1755 if (PyUnicode_READY(unicode))
1756 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001757
1758 size = PyUnicode_GET_LENGTH(unicode);
1759 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1760 if (!copy)
1761 return NULL;
1762 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1763
1764 data = PyUnicode_DATA(unicode);
1765 switch (PyUnicode_KIND(unicode))
1766 {
1767 case PyUnicode_1BYTE_KIND:
1768 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1769 break;
1770 case PyUnicode_2BYTE_KIND:
1771 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1772 break;
1773 case PyUnicode_4BYTE_KIND:
1774 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1775 break;
1776 default:
1777 assert(0);
1778 break;
1779 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001780 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001781 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001782}
1783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784
Victor Stinnerbc603d12011-10-02 01:00:40 +02001785/* Widen Unicode objects to larger buffers. Don't write terminating null
1786 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787
1788void*
1789_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1790{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001791 Py_ssize_t len;
1792 void *result;
1793 unsigned int skind;
1794
1795 if (PyUnicode_READY(s))
1796 return NULL;
1797
1798 len = PyUnicode_GET_LENGTH(s);
1799 skind = PyUnicode_KIND(s);
1800 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001801 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return NULL;
1803 }
1804 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001805 case PyUnicode_2BYTE_KIND:
1806 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1807 if (!result)
1808 return PyErr_NoMemory();
1809 assert(skind == PyUnicode_1BYTE_KIND);
1810 _PyUnicode_CONVERT_BYTES(
1811 Py_UCS1, Py_UCS2,
1812 PyUnicode_1BYTE_DATA(s),
1813 PyUnicode_1BYTE_DATA(s) + len,
1814 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001816 case PyUnicode_4BYTE_KIND:
1817 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1818 if (!result)
1819 return PyErr_NoMemory();
1820 if (skind == PyUnicode_2BYTE_KIND) {
1821 _PyUnicode_CONVERT_BYTES(
1822 Py_UCS2, Py_UCS4,
1823 PyUnicode_2BYTE_DATA(s),
1824 PyUnicode_2BYTE_DATA(s) + len,
1825 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001827 else {
1828 assert(skind == PyUnicode_1BYTE_KIND);
1829 _PyUnicode_CONVERT_BYTES(
1830 Py_UCS1, Py_UCS4,
1831 PyUnicode_1BYTE_DATA(s),
1832 PyUnicode_1BYTE_DATA(s) + len,
1833 result);
1834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001836 default:
1837 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 }
Victor Stinner01698042011-10-04 00:04:26 +02001839 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 return NULL;
1841}
1842
1843static Py_UCS4*
1844as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1845 int copy_null)
1846{
1847 int kind;
1848 void *data;
1849 Py_ssize_t len, targetlen;
1850 if (PyUnicode_READY(string) == -1)
1851 return NULL;
1852 kind = PyUnicode_KIND(string);
1853 data = PyUnicode_DATA(string);
1854 len = PyUnicode_GET_LENGTH(string);
1855 targetlen = len;
1856 if (copy_null)
1857 targetlen++;
1858 if (!target) {
1859 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1860 PyErr_NoMemory();
1861 return NULL;
1862 }
1863 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1864 if (!target) {
1865 PyErr_NoMemory();
1866 return NULL;
1867 }
1868 }
1869 else {
1870 if (targetsize < targetlen) {
1871 PyErr_Format(PyExc_SystemError,
1872 "string is longer than the buffer");
1873 if (copy_null && 0 < targetsize)
1874 target[0] = 0;
1875 return NULL;
1876 }
1877 }
1878 if (kind != PyUnicode_4BYTE_KIND) {
1879 Py_ssize_t i;
1880 for (i = 0; i < len; i++)
1881 target[i] = PyUnicode_READ(kind, data, i);
1882 }
1883 else
1884 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1885 if (copy_null)
1886 target[len] = 0;
1887 return target;
1888}
1889
1890Py_UCS4*
1891PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1892 int copy_null)
1893{
1894 if (target == NULL || targetsize < 1) {
1895 PyErr_BadInternalCall();
1896 return NULL;
1897 }
1898 return as_ucs4(string, target, targetsize, copy_null);
1899}
1900
1901Py_UCS4*
1902PyUnicode_AsUCS4Copy(PyObject *string)
1903{
1904 return as_ucs4(string, NULL, 0, 1);
1905}
1906
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A `size` of -1 means the length is taken from wcslen(w).  When `w` is
   NULL, a size of 0 yields the result of PyUnicode_New(0, 0) and any
   other size raises a bad internal call error.  Otherwise the work is
   delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001927
Walter Dörwald346737f2007-05-31 10:44:43 +00001928static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001929makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1930 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001931{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001932 *fmt++ = '%';
1933 if (width) {
1934 if (zeropad)
1935 *fmt++ = '0';
1936 fmt += sprintf(fmt, "%d", width);
1937 }
1938 if (precision)
1939 fmt += sprintf(fmt, ".%d", precision);
1940 if (longflag)
1941 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001942 else if (longlongflag) {
1943 /* longlongflag should only ever be nonzero on machines with
1944 HAVE_LONG_LONG defined */
1945#ifdef HAVE_LONG_LONG
1946 char *f = PY_FORMAT_LONG_LONG;
1947 while (*f)
1948 *fmt++ = *f++;
1949#else
1950 /* we shouldn't ever get here */
1951 assert(0);
1952 *fmt++ = 'l';
1953#endif
1954 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001955 else if (size_tflag) {
1956 char *f = PY_FORMAT_SIZE_T;
1957 while (*f)
1958 *fmt++ = *f++;
1959 }
1960 *fmt++ = c;
1961 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001962}
1963
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a format specification.  Parses the
   optional width, the optional ".precision" and one length modifier
   ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; each recognized only
   when followed by 'd', 'u' or 'i').  The parsed values are stored
   through the non-NULL output pointers.  Returns a pointer to the
   character where parsing stopped, normally the conversion character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    /* Report results through whichever output pointers were supplied. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2028
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002029/* maximum number of characters required for output of %ld. 21 characters
2030 allows for 64-bit integers (in decimal) and an optional sign. */
2031#define MAX_LONG_CHARS 21
2032/* maximum number of characters required for output of %lld.
2033 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2034 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2035#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2036
Walter Dörwaldd2034312007-05-18 16:29:38 +00002037PyObject *
2038PyUnicode_FromFormatV(const char *format, va_list vargs)
2039{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002040 va_list count;
2041 Py_ssize_t callcount = 0;
2042 PyObject **callresults = NULL;
2043 PyObject **callresult = NULL;
2044 Py_ssize_t n = 0;
2045 int width = 0;
2046 int precision = 0;
2047 int zeropad;
2048 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002049 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002050 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002051 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2053 Py_UCS4 argmaxchar;
2054 Py_ssize_t numbersize = 0;
2055 char *numberresults = NULL;
2056 char *numberresult = NULL;
2057 Py_ssize_t i;
2058 int kind;
2059 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002060
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002061 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002062 /* step 1: count the number of %S/%R/%A/%s format specifications
2063 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2064 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002066 * also estimate a upper bound for all the number formats in the string,
2067 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002069 for (f = format; *f; f++) {
2070 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002071 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2073 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2074 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2075 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002078#ifdef HAVE_LONG_LONG
2079 if (longlongflag) {
2080 if (width < MAX_LONG_LONG_CHARS)
2081 width = MAX_LONG_LONG_CHARS;
2082 }
2083 else
2084#endif
2085 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2086 including sign. Decimal takes the most space. This
2087 isn't enough for octal. If a width is specified we
2088 need more (which we allocate later). */
2089 if (width < MAX_LONG_CHARS)
2090 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091
2092 /* account for the size + '\0' to separate numbers
2093 inside of the numberresults buffer */
2094 numbersize += (width + 1);
2095 }
2096 }
2097 else if ((unsigned char)*f > 127) {
2098 PyErr_Format(PyExc_ValueError,
2099 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2100 "string, got a non-ASCII byte: 0x%02x",
2101 (unsigned char)*f);
2102 return NULL;
2103 }
2104 }
2105 /* step 2: allocate memory for the results of
2106 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2107 if (callcount) {
2108 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2109 if (!callresults) {
2110 PyErr_NoMemory();
2111 return NULL;
2112 }
2113 callresult = callresults;
2114 }
2115 /* step 2.5: allocate memory for the results of formating numbers */
2116 if (numbersize) {
2117 numberresults = PyObject_Malloc(numbersize);
2118 if (!numberresults) {
2119 PyErr_NoMemory();
2120 goto fail;
2121 }
2122 numberresult = numberresults;
2123 }
2124
2125 /* step 3: format numbers and figure out how large a buffer we need */
2126 for (f = format; *f; f++) {
2127 if (*f == '%') {
2128 const char* p;
2129 int longflag;
2130 int longlongflag;
2131 int size_tflag;
2132 int numprinted;
2133
2134 p = f;
2135 zeropad = (f[1] == '0');
2136 f = parse_format_flags(f, &width, &precision,
2137 &longflag, &longlongflag, &size_tflag);
2138 switch (*f) {
2139 case 'c':
2140 {
2141 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002142 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 n++;
2144 break;
2145 }
2146 case '%':
2147 n++;
2148 break;
2149 case 'i':
2150 case 'd':
2151 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2152 width, precision, *f);
2153 if (longflag)
2154 numprinted = sprintf(numberresult, fmt,
2155 va_arg(count, long));
2156#ifdef HAVE_LONG_LONG
2157 else if (longlongflag)
2158 numprinted = sprintf(numberresult, fmt,
2159 va_arg(count, PY_LONG_LONG));
2160#endif
2161 else if (size_tflag)
2162 numprinted = sprintf(numberresult, fmt,
2163 va_arg(count, Py_ssize_t));
2164 else
2165 numprinted = sprintf(numberresult, fmt,
2166 va_arg(count, int));
2167 n += numprinted;
2168 /* advance by +1 to skip over the '\0' */
2169 numberresult += (numprinted + 1);
2170 assert(*(numberresult - 1) == '\0');
2171 assert(*(numberresult - 2) != '\0');
2172 assert(numprinted >= 0);
2173 assert(numberresult <= numberresults + numbersize);
2174 break;
2175 case 'u':
2176 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2177 width, precision, 'u');
2178 if (longflag)
2179 numprinted = sprintf(numberresult, fmt,
2180 va_arg(count, unsigned long));
2181#ifdef HAVE_LONG_LONG
2182 else if (longlongflag)
2183 numprinted = sprintf(numberresult, fmt,
2184 va_arg(count, unsigned PY_LONG_LONG));
2185#endif
2186 else if (size_tflag)
2187 numprinted = sprintf(numberresult, fmt,
2188 va_arg(count, size_t));
2189 else
2190 numprinted = sprintf(numberresult, fmt,
2191 va_arg(count, unsigned int));
2192 n += numprinted;
2193 numberresult += (numprinted + 1);
2194 assert(*(numberresult - 1) == '\0');
2195 assert(*(numberresult - 2) != '\0');
2196 assert(numprinted >= 0);
2197 assert(numberresult <= numberresults + numbersize);
2198 break;
2199 case 'x':
2200 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2201 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2202 n += numprinted;
2203 numberresult += (numprinted + 1);
2204 assert(*(numberresult - 1) == '\0');
2205 assert(*(numberresult - 2) != '\0');
2206 assert(numprinted >= 0);
2207 assert(numberresult <= numberresults + numbersize);
2208 break;
2209 case 'p':
2210 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2211 /* %p is ill-defined: ensure leading 0x. */
2212 if (numberresult[1] == 'X')
2213 numberresult[1] = 'x';
2214 else if (numberresult[1] != 'x') {
2215 memmove(numberresult + 2, numberresult,
2216 strlen(numberresult) + 1);
2217 numberresult[0] = '0';
2218 numberresult[1] = 'x';
2219 numprinted += 2;
2220 }
2221 n += numprinted;
2222 numberresult += (numprinted + 1);
2223 assert(*(numberresult - 1) == '\0');
2224 assert(*(numberresult - 2) != '\0');
2225 assert(numprinted >= 0);
2226 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002227 break;
2228 case 's':
2229 {
2230 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002231 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002232 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2233 if (!str)
2234 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 /* since PyUnicode_DecodeUTF8 returns already flexible
2236 unicode objects, there is no need to call ready on them */
2237 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002238 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002240 /* Remember the str and switch to the next slot */
2241 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002242 break;
2243 }
2244 case 'U':
2245 {
2246 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002247 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 if (PyUnicode_READY(obj) == -1)
2249 goto fail;
2250 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002251 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 break;
2254 }
2255 case 'V':
2256 {
2257 PyObject *obj = va_arg(count, PyObject *);
2258 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002259 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002261 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002262 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 if (PyUnicode_READY(obj) == -1)
2264 goto fail;
2265 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002266 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002268 *callresult++ = NULL;
2269 }
2270 else {
2271 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2272 if (!str_obj)
2273 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002274 if (PyUnicode_READY(str_obj)) {
2275 Py_DECREF(str_obj);
2276 goto fail;
2277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002279 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002281 *callresult++ = str_obj;
2282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 break;
2284 }
2285 case 'S':
2286 {
2287 PyObject *obj = va_arg(count, PyObject *);
2288 PyObject *str;
2289 assert(obj);
2290 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002294 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002296 /* Remember the str and switch to the next slot */
2297 *callresult++ = str;
2298 break;
2299 }
2300 case 'R':
2301 {
2302 PyObject *obj = va_arg(count, PyObject *);
2303 PyObject *repr;
2304 assert(obj);
2305 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002309 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002311 /* Remember the repr and switch to the next slot */
2312 *callresult++ = repr;
2313 break;
2314 }
2315 case 'A':
2316 {
2317 PyObject *obj = va_arg(count, PyObject *);
2318 PyObject *ascii;
2319 assert(obj);
2320 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002322 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002324 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002326 /* Remember the repr and switch to the next slot */
2327 *callresult++ = ascii;
2328 break;
2329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002330 default:
2331 /* if we stumble upon an unknown
2332 formatting code, copy the rest of
2333 the format string to the output
2334 string. (we cannot just skip the
2335 code, since there's no way to know
2336 what's in the argument list) */
2337 n += strlen(p);
2338 goto expand;
2339 }
2340 } else
2341 n++;
2342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002344 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002346 we don't have to resize the string.
2347 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002348 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002349 if (!string)
2350 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 kind = PyUnicode_KIND(string);
2352 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002357 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002358 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002359
2360 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2362 /* checking for == because the last argument could be a empty
2363 string, which causes i to point to end, the assert at the end of
2364 the loop */
2365 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002366
Benjamin Peterson14339b62009-01-31 16:36:08 +00002367 switch (*f) {
2368 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002369 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370 const int ordinal = va_arg(vargs, int);
2371 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002372 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002373 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002374 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002376 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002377 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 case 'p':
2379 /* unused, since we already have the result */
2380 if (*f == 'p')
2381 (void) va_arg(vargs, void *);
2382 else
2383 (void) va_arg(vargs, int);
2384 /* extract the result from numberresults and append. */
2385 for (; *numberresult; ++i, ++numberresult)
2386 PyUnicode_WRITE(kind, data, i, *numberresult);
2387 /* skip over the separating '\0' */
2388 assert(*numberresult == '\0');
2389 numberresult++;
2390 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002391 break;
2392 case 's':
2393 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002394 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002396 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 size = PyUnicode_GET_LENGTH(*callresult);
2398 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002399 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002401 /* We're done with the unicode()/repr() => forget it */
2402 Py_DECREF(*callresult);
2403 /* switch to next unicode()/repr() result */
2404 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002405 break;
2406 }
2407 case 'U':
2408 {
2409 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 Py_ssize_t size;
2411 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2412 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002413 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002415 break;
2416 }
2417 case 'V':
2418 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002421 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 size = PyUnicode_GET_LENGTH(obj);
2424 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002425 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 size = PyUnicode_GET_LENGTH(*callresult);
2429 assert(PyUnicode_KIND(*callresult) <=
2430 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002431 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002433 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002435 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002436 break;
2437 }
2438 case 'S':
2439 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002440 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002441 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002442 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 /* unused, since we already have the result */
2444 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002446 copy_characters(string, i, *callresult, 0, size);
2447 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 /* We're done with the unicode()/repr() => forget it */
2449 Py_DECREF(*callresult);
2450 /* switch to next unicode()/repr() result */
2451 ++callresult;
2452 break;
2453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 break;
2457 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 for (; *p; ++p, ++i)
2459 PyUnicode_WRITE(kind, data, i, *p);
2460 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002461 goto end;
2462 }
Victor Stinner1205f272010-09-11 00:54:47 +00002463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 else {
2465 assert(i < PyUnicode_GET_LENGTH(string));
2466 PyUnicode_WRITE(kind, data, i++, *f);
2467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002470
Benjamin Peterson29060642009-01-31 22:14:21 +00002471 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 if (callresults)
2473 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 if (numberresults)
2475 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002476 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002478 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 if (callresults) {
2480 PyObject **callresult2 = callresults;
2481 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002482 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 ++callresult2;
2484 }
2485 PyObject_Free(callresults);
2486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 if (numberresults)
2488 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002490}
2491
Walter Dörwaldd2034312007-05-18 16:29:38 +00002492PyObject *
2493PyUnicode_FromFormat(const char *format, ...)
2494{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002495 PyObject* ret;
2496 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002497
2498#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002500#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002502#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 ret = PyUnicode_FromFormatV(format, vargs);
2504 va_end(vargs);
2505 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002506}
2507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508#ifdef HAVE_WCHAR_H
2509
Victor Stinner5593d8a2010-10-02 11:11:27 +00002510/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2511 convert a Unicode object to a wide character string.
2512
Victor Stinnerd88d9832011-09-06 02:00:05 +02002513 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002514 character) required to convert the unicode object. Ignore size argument.
2515
Victor Stinnerd88d9832011-09-06 02:00:05 +02002516 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002517 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002518 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002519static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002520unicode_aswidechar(PyUnicodeObject *unicode,
2521 wchar_t *w,
2522 Py_ssize_t size)
2523{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002524 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 const wchar_t *wstr;
2526
2527 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2528 if (wstr == NULL)
2529 return -1;
2530
Victor Stinner5593d8a2010-10-02 11:11:27 +00002531 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002532 if (size > res)
2533 size = res + 1;
2534 else
2535 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002537 return res;
2538 }
2539 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002541}
2542
2543Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002544PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002545 wchar_t *w,
2546 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547{
2548 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002549 PyErr_BadInternalCall();
2550 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002552 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553}
2554
Victor Stinner137c34c2010-09-29 10:25:54 +00002555wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002556PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002557 Py_ssize_t *size)
2558{
2559 wchar_t* buffer;
2560 Py_ssize_t buflen;
2561
2562 if (unicode == NULL) {
2563 PyErr_BadInternalCall();
2564 return NULL;
2565 }
2566
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002567 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 if (buflen == -1)
2569 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002570 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002571 PyErr_NoMemory();
2572 return NULL;
2573 }
2574
Victor Stinner137c34c2010-09-29 10:25:54 +00002575 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2576 if (buffer == NULL) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002580 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 if (buflen == -1)
2582 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002583 if (size != NULL)
2584 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002585 return buffer;
2586}
2587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
Alexander Belopolsky40018472011-02-26 01:02:56 +00002590PyObject *
2591PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002594 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002595 PyErr_SetString(PyExc_ValueError,
2596 "chr() arg not in range(0x110000)");
2597 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002598 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (ordinal < 256)
2601 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 v = PyUnicode_New(1, ordinal);
2604 if (v == NULL)
2605 return NULL;
2606 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002607 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002609}
2610
Alexander Belopolsky40018472011-02-26 01:02:56 +00002611PyObject *
2612PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002614 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002616 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002617 if (PyUnicode_READY(obj))
2618 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 Py_INCREF(obj);
2620 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002621 }
2622 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002623 /* For a Unicode subtype that's not a Unicode object,
2624 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002625 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002626 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002627 PyErr_Format(PyExc_TypeError,
2628 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002629 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002630 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002631}
2632
Alexander Belopolsky40018472011-02-26 01:02:56 +00002633PyObject *
2634PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002635 const char *encoding,
2636 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002637{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002638 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002639 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002640
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002642 PyErr_BadInternalCall();
2643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002645
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002646 /* Decoding bytes objects is the most common case and should be fast */
2647 if (PyBytes_Check(obj)) {
2648 if (PyBytes_GET_SIZE(obj) == 0) {
2649 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002650 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002651 }
2652 else {
2653 v = PyUnicode_Decode(
2654 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2655 encoding, errors);
2656 }
2657 return v;
2658 }
2659
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002660 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002661 PyErr_SetString(PyExc_TypeError,
2662 "decoding str is not supported");
2663 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002665
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002666 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2667 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2668 PyErr_Format(PyExc_TypeError,
2669 "coercing to str: need bytes, bytearray "
2670 "or buffer-like object, %.80s found",
2671 Py_TYPE(obj)->tp_name);
2672 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002673 }
Tim Petersced69f82003-09-16 20:30:58 +00002674
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002675 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002677 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 }
Tim Petersced69f82003-09-16 20:30:58 +00002679 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002680 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002681
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002682 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002683 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684}
2685
/* Copy `encoding` into `lower`, folding upper-case characters to lower case
   and turning '_' into '-' so that spellings such as UTF_8 are recognised.
   `lower` has room for `lower_len` bytes including the terminating null.
   Return 1 on success, 0 if the encoding name does not fit (it is longer
   than lower_len-1 characters); on failure `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *const limit = &lower[lower_len - 1];

    for (; *src; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
2718
Alexander Belopolsky40018472011-02-26 01:02:56 +00002719PyObject *
2720PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002721 Py_ssize_t size,
2722 const char *encoding,
2723 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002724{
2725 PyObject *buffer = NULL, *unicode;
2726 Py_buffer info;
2727 char lower[11]; /* Enough for any encoding shortcut */
2728
2729 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002730 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002731
2732 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002733 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002734 if ((strcmp(lower, "utf-8") == 0) ||
2735 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002736 return PyUnicode_DecodeUTF8(s, size, errors);
2737 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002738 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002739 (strcmp(lower, "iso-8859-1") == 0))
2740 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002741#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002742 else if (strcmp(lower, "mbcs") == 0)
2743 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002744#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002745 else if (strcmp(lower, "ascii") == 0)
2746 return PyUnicode_DecodeASCII(s, size, errors);
2747 else if (strcmp(lower, "utf-16") == 0)
2748 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2749 else if (strcmp(lower, "utf-32") == 0)
2750 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752
2753 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002754 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002755 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002756 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002757 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 if (buffer == NULL)
2759 goto onError;
2760 unicode = PyCodec_Decode(buffer, encoding, errors);
2761 if (unicode == NULL)
2762 goto onError;
2763 if (!PyUnicode_Check(unicode)) {
2764 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002765 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002766 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 Py_DECREF(unicode);
2768 goto onError;
2769 }
2770 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002771#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002772 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 Py_DECREF(unicode);
2774 return NULL;
2775 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002776#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002777 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002779
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 Py_XDECREF(buffer);
2782 return NULL;
2783}
2784
Alexander Belopolsky40018472011-02-26 01:02:56 +00002785PyObject *
2786PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002787 const char *encoding,
2788 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002789{
2790 PyObject *v;
2791
2792 if (!PyUnicode_Check(unicode)) {
2793 PyErr_BadArgument();
2794 goto onError;
2795 }
2796
2797 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002798 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002799
2800 /* Decode via the codec registry */
2801 v = PyCodec_Decode(unicode, encoding, errors);
2802 if (v == NULL)
2803 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002804 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002805 return v;
2806
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002808 return NULL;
2809}
2810
Alexander Belopolsky40018472011-02-26 01:02:56 +00002811PyObject *
2812PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002813 const char *encoding,
2814 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002815{
2816 PyObject *v;
2817
2818 if (!PyUnicode_Check(unicode)) {
2819 PyErr_BadArgument();
2820 goto onError;
2821 }
2822
2823 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002825
2826 /* Decode via the codec registry */
2827 v = PyCodec_Decode(unicode, encoding, errors);
2828 if (v == NULL)
2829 goto onError;
2830 if (!PyUnicode_Check(v)) {
2831 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002832 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002833 Py_TYPE(v)->tp_name);
2834 Py_DECREF(v);
2835 goto onError;
2836 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002837 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002838 return v;
2839
Benjamin Peterson29060642009-01-31 22:14:21 +00002840 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002841 return NULL;
2842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 Py_ssize_t size,
2847 const char *encoding,
2848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849{
2850 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 unicode = PyUnicode_FromUnicode(s, size);
2853 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2856 Py_DECREF(unicode);
2857 return v;
2858}
2859
Alexander Belopolsky40018472011-02-26 01:02:56 +00002860PyObject *
2861PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002862 const char *encoding,
2863 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002864{
2865 PyObject *v;
2866
2867 if (!PyUnicode_Check(unicode)) {
2868 PyErr_BadArgument();
2869 goto onError;
2870 }
2871
2872 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002874
2875 /* Encode via the codec registry */
2876 v = PyCodec_Encode(unicode, encoding, errors);
2877 if (v == NULL)
2878 goto onError;
2879 return v;
2880
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002882 return NULL;
2883}
2884
Victor Stinnerad158722010-10-27 00:25:46 +00002885PyObject *
2886PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002887{
Victor Stinner99b95382011-07-04 14:23:54 +02002888#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002889 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2890 PyUnicode_GET_SIZE(unicode),
2891 NULL);
2892#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002894#else
Victor Stinner793b5312011-04-27 00:24:21 +02002895 PyInterpreterState *interp = PyThreadState_GET()->interp;
2896 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2897 cannot use it to encode and decode filenames before it is loaded. Load
2898 the Python codec requires to encode at least its own filename. Use the C
2899 version of the locale codec until the codec registry is initialized and
2900 the Python codec is loaded.
2901
2902 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2903 cannot only rely on it: check also interp->fscodec_initialized for
2904 subinterpreters. */
2905 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002906 return PyUnicode_AsEncodedString(unicode,
2907 Py_FileSystemDefaultEncoding,
2908 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002909 }
2910 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002911 /* locale encoding with surrogateescape */
2912 wchar_t *wchar;
2913 char *bytes;
2914 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002915 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002916
2917 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2918 if (wchar == NULL)
2919 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002920 bytes = _Py_wchar2char(wchar, &error_pos);
2921 if (bytes == NULL) {
2922 if (error_pos != (size_t)-1) {
2923 char *errmsg = strerror(errno);
2924 PyObject *exc = NULL;
2925 if (errmsg == NULL)
2926 errmsg = "Py_wchar2char() failed";
2927 raise_encode_exception(&exc,
2928 "filesystemencoding",
2929 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2930 error_pos, error_pos+1,
2931 errmsg);
2932 Py_XDECREF(exc);
2933 }
2934 else
2935 PyErr_NoMemory();
2936 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002937 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002938 }
2939 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002940
2941 bytes_obj = PyBytes_FromString(bytes);
2942 PyMem_Free(bytes);
2943 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002944 }
Victor Stinnerad158722010-10-27 00:25:46 +00002945#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002946}
2947
Alexander Belopolsky40018472011-02-26 01:02:56 +00002948PyObject *
2949PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002950 const char *encoding,
2951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952{
2953 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002954 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002955
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 if (!PyUnicode_Check(unicode)) {
2957 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 }
Fred Drakee4315f52000-05-09 19:53:39 +00002960
Victor Stinner2f283c22011-03-02 01:21:46 +00002961 if (encoding == NULL) {
2962 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002964 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002966 }
Fred Drakee4315f52000-05-09 19:53:39 +00002967
2968 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002969 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002970 if ((strcmp(lower, "utf-8") == 0) ||
2971 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002972 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002973 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002974 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002975 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002977 }
Victor Stinner37296e82010-06-10 13:36:23 +00002978 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002979 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002980 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002982#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002983 else if (strcmp(lower, "mbcs") == 0)
2984 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2985 PyUnicode_GET_SIZE(unicode),
2986 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002987#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002988 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
2992 /* Encode via the codec registry */
2993 v = PyCodec_Encode(unicode, encoding, errors);
2994 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002995 return NULL;
2996
2997 /* The normal path */
2998 if (PyBytes_Check(v))
2999 return v;
3000
3001 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003002 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003003 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003004 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003005
3006 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3007 "encoder %s returned bytearray instead of bytes",
3008 encoding);
3009 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003010 Py_DECREF(v);
3011 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003013
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003014 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3015 Py_DECREF(v);
3016 return b;
3017 }
3018
3019 PyErr_Format(PyExc_TypeError,
3020 "encoder did not return a bytes object (type=%.400s)",
3021 Py_TYPE(v)->tp_name);
3022 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023 return NULL;
3024}
3025
Alexander Belopolsky40018472011-02-26 01:02:56 +00003026PyObject *
3027PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003028 const char *encoding,
3029 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030{
3031 PyObject *v;
3032
3033 if (!PyUnicode_Check(unicode)) {
3034 PyErr_BadArgument();
3035 goto onError;
3036 }
3037
3038 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003040
3041 /* Encode via the codec registry */
3042 v = PyCodec_Encode(unicode, encoding, errors);
3043 if (v == NULL)
3044 goto onError;
3045 if (!PyUnicode_Check(v)) {
3046 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003047 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048 Py_TYPE(v)->tp_name);
3049 Py_DECREF(v);
3050 goto onError;
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 return NULL;
3056}
3057
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003058PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003059PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003060 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003061 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3062}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003063
Christian Heimes5894ba72007-11-04 11:43:14 +00003064PyObject*
3065PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3066{
Victor Stinner99b95382011-07-04 14:23:54 +02003067#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003068 return PyUnicode_DecodeMBCS(s, size, NULL);
3069#elif defined(__APPLE__)
3070 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3071#else
Victor Stinner793b5312011-04-27 00:24:21 +02003072 PyInterpreterState *interp = PyThreadState_GET()->interp;
3073 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3074 cannot use it to encode and decode filenames before it is loaded. Load
3075 the Python codec requires to encode at least its own filename. Use the C
3076 version of the locale codec until the codec registry is initialized and
3077 the Python codec is loaded.
3078
3079 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3080 cannot only rely on it: check also interp->fscodec_initialized for
3081 subinterpreters. */
3082 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003083 return PyUnicode_Decode(s, size,
3084 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003085 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003086 }
3087 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003088 /* locale encoding with surrogateescape */
3089 wchar_t *wchar;
3090 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003091 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003092
3093 if (s[size] != '\0' || size != strlen(s)) {
3094 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3095 return NULL;
3096 }
3097
Victor Stinner168e1172010-10-16 23:16:16 +00003098 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003099 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003100 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003101
Victor Stinner168e1172010-10-16 23:16:16 +00003102 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003103 PyMem_Free(wchar);
3104 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003105 }
Victor Stinnerad158722010-10-27 00:25:46 +00003106#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003107}
3108
Martin v. Löwis011e8422009-05-05 04:43:17 +00003109
3110int
3111PyUnicode_FSConverter(PyObject* arg, void* addr)
3112{
3113 PyObject *output = NULL;
3114 Py_ssize_t size;
3115 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003116 if (arg == NULL) {
3117 Py_DECREF(*(PyObject**)addr);
3118 return 1;
3119 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003120 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003121 output = arg;
3122 Py_INCREF(output);
3123 }
3124 else {
3125 arg = PyUnicode_FromObject(arg);
3126 if (!arg)
3127 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003128 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003129 Py_DECREF(arg);
3130 if (!output)
3131 return 0;
3132 if (!PyBytes_Check(output)) {
3133 Py_DECREF(output);
3134 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3135 return 0;
3136 }
3137 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003138 size = PyBytes_GET_SIZE(output);
3139 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003140 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003141 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003142 Py_DECREF(output);
3143 return 0;
3144 }
3145 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003146 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003147}
3148
3149
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003150int
3151PyUnicode_FSDecoder(PyObject* arg, void* addr)
3152{
3153 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003154 if (arg == NULL) {
3155 Py_DECREF(*(PyObject**)addr);
3156 return 1;
3157 }
3158 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003159 if (PyUnicode_READY(arg))
3160 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003161 output = arg;
3162 Py_INCREF(output);
3163 }
3164 else {
3165 arg = PyBytes_FromObject(arg);
3166 if (!arg)
3167 return 0;
3168 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3169 PyBytes_GET_SIZE(arg));
3170 Py_DECREF(arg);
3171 if (!output)
3172 return 0;
3173 if (!PyUnicode_Check(output)) {
3174 Py_DECREF(output);
3175 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3176 return 0;
3177 }
3178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3180 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003181 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3182 Py_DECREF(output);
3183 return 0;
3184 }
3185 *(PyObject**)addr = output;
3186 return Py_CLEANUP_SUPPORTED;
3187}
3188
3189
Martin v. Löwis5b222132007-06-10 09:51:05 +00003190char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003191PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003192{
Christian Heimesf3863112007-11-22 07:46:41 +00003193 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003194 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3195
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
3198 return NULL;
3199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003200 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003201 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003202
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003203 if (PyUnicode_UTF8(unicode) == NULL) {
3204 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3206 if (bytes == NULL)
3207 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003208 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3209 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210 Py_DECREF(bytes);
3211 return NULL;
3212 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003213 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3214 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003215 Py_DECREF(bytes);
3216 }
3217
3218 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003219 *psize = PyUnicode_UTF8_LENGTH(unicode);
3220 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003221}
3222
3223char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003224PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3227}
3228
3229#ifdef Py_DEBUG
3230int unicode_as_unicode_calls = 0;
3231#endif
3232
3233
3234Py_UNICODE *
3235PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3236{
3237 PyUnicodeObject *u;
3238 const unsigned char *one_byte;
3239#if SIZEOF_WCHAR_T == 4
3240 const Py_UCS2 *two_bytes;
3241#else
3242 const Py_UCS4 *four_bytes;
3243 const Py_UCS4 *ucs4_end;
3244 Py_ssize_t num_surrogates;
3245#endif
3246 wchar_t *w;
3247 wchar_t *wchar_end;
3248
3249 if (!PyUnicode_Check(unicode)) {
3250 PyErr_BadArgument();
3251 return NULL;
3252 }
3253 u = (PyUnicodeObject*)unicode;
3254 if (_PyUnicode_WSTR(u) == NULL) {
3255 /* Non-ASCII compact unicode object */
3256 assert(_PyUnicode_KIND(u) != 0);
3257 assert(PyUnicode_IS_READY(u));
3258
3259#ifdef Py_DEBUG
3260 ++unicode_as_unicode_calls;
3261#endif
3262
3263 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3264#if SIZEOF_WCHAR_T == 2
3265 four_bytes = PyUnicode_4BYTE_DATA(u);
3266 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3267 num_surrogates = 0;
3268
3269 for (; four_bytes < ucs4_end; ++four_bytes) {
3270 if (*four_bytes > 0xFFFF)
3271 ++num_surrogates;
3272 }
3273
3274 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3275 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3276 if (!_PyUnicode_WSTR(u)) {
3277 PyErr_NoMemory();
3278 return NULL;
3279 }
3280 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3281
3282 w = _PyUnicode_WSTR(u);
3283 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3284 four_bytes = PyUnicode_4BYTE_DATA(u);
3285 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3286 if (*four_bytes > 0xFFFF) {
3287 /* encode surrogate pair in this case */
3288 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3289 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3290 }
3291 else
3292 *w = *four_bytes;
3293
3294 if (w > wchar_end) {
3295 assert(0 && "Miscalculated string end");
3296 }
3297 }
3298 *w = 0;
3299#else
3300 /* sizeof(wchar_t) == 4 */
3301 Py_FatalError("Impossible unicode object state, wstr and str "
3302 "should share memory already.");
3303 return NULL;
3304#endif
3305 }
3306 else {
3307 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3308 (_PyUnicode_LENGTH(u) + 1));
3309 if (!_PyUnicode_WSTR(u)) {
3310 PyErr_NoMemory();
3311 return NULL;
3312 }
3313 if (!PyUnicode_IS_COMPACT_ASCII(u))
3314 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3315 w = _PyUnicode_WSTR(u);
3316 wchar_end = w + _PyUnicode_LENGTH(u);
3317
3318 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3319 one_byte = PyUnicode_1BYTE_DATA(u);
3320 for (; w < wchar_end; ++one_byte, ++w)
3321 *w = *one_byte;
3322 /* null-terminate the wstr */
3323 *w = 0;
3324 }
3325 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3326#if SIZEOF_WCHAR_T == 4
3327 two_bytes = PyUnicode_2BYTE_DATA(u);
3328 for (; w < wchar_end; ++two_bytes, ++w)
3329 *w = *two_bytes;
3330 /* null-terminate the wstr */
3331 *w = 0;
3332#else
3333 /* sizeof(wchar_t) == 2 */
3334 PyObject_FREE(_PyUnicode_WSTR(u));
3335 _PyUnicode_WSTR(u) = NULL;
3336 Py_FatalError("Impossible unicode object state, wstr "
3337 "and str should share memory already.");
3338 return NULL;
3339#endif
3340 }
3341 else {
3342 assert(0 && "This should never happen.");
3343 }
3344 }
3345 }
3346 if (size != NULL)
3347 *size = PyUnicode_WSTR_LENGTH(u);
3348 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003349}
3350
Alexander Belopolsky40018472011-02-26 01:02:56 +00003351Py_UNICODE *
3352PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355}
3356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358Py_ssize_t
3359PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360{
3361 if (!PyUnicode_Check(unicode)) {
3362 PyErr_BadArgument();
3363 goto onError;
3364 }
3365 return PyUnicode_GET_SIZE(unicode);
3366
Benjamin Peterson29060642009-01-31 22:14:21 +00003367 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 return -1;
3369}
3370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003371Py_ssize_t
3372PyUnicode_GetLength(PyObject *unicode)
3373{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003374 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003375 PyErr_BadArgument();
3376 return -1;
3377 }
3378
3379 return PyUnicode_GET_LENGTH(unicode);
3380}
3381
3382Py_UCS4
3383PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3384{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003385 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3386 PyErr_BadArgument();
3387 return (Py_UCS4)-1;
3388 }
3389 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3390 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391 return (Py_UCS4)-1;
3392 }
3393 return PyUnicode_READ_CHAR(unicode, index);
3394}
3395
3396int
3397PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3398{
3399 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003400 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 return -1;
3402 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003403 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3404 PyErr_SetString(PyExc_IndexError, "string index out of range");
3405 return -1;
3406 }
3407 if (_PyUnicode_Dirty(unicode))
3408 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003409 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3410 index, ch);
3411 return 0;
3412}
3413
/* Return the name of the default encoding, which is always "utf-8".  The
   string has static storage duration; the caller must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3419
Victor Stinner554f3f02010-06-16 23:33:54 +00003420/* create or adjust a UnicodeDecodeError */
3421static void
3422make_decode_exception(PyObject **exceptionObject,
3423 const char *encoding,
3424 const char *input, Py_ssize_t length,
3425 Py_ssize_t startpos, Py_ssize_t endpos,
3426 const char *reason)
3427{
3428 if (*exceptionObject == NULL) {
3429 *exceptionObject = PyUnicodeDecodeError_Create(
3430 encoding, input, length, startpos, endpos, reason);
3431 }
3432 else {
3433 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3434 goto onError;
3435 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3436 goto onError;
3437 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3438 goto onError;
3439 }
3440 return;
3441
3442onError:
3443 Py_DECREF(*exceptionObject);
3444 *exceptionObject = NULL;
3445}
3446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447/* error handling callback helper:
3448 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003449 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 and adjust various state variables.
3451 return 0 on success, -1 on error
3452*/
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454static int
3455unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding, const char *reason,
3457 const char **input, const char **inend, Py_ssize_t *startinpos,
3458 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3459 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003461 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462
3463 PyObject *restuple = NULL;
3464 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003465 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003466 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003467 Py_ssize_t requiredsize;
3468 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003470 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003471 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 int res = -1;
3473
3474 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003475 *errorHandler = PyCodec_LookupError(errors);
3476 if (*errorHandler == NULL)
3477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 }
3479
Victor Stinner554f3f02010-06-16 23:33:54 +00003480 make_decode_exception(exceptionObject,
3481 encoding,
3482 *input, *inend - *input,
3483 *startinpos, *endinpos,
3484 reason);
3485 if (*exceptionObject == NULL)
3486 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487
3488 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3489 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003492 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003493 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 }
3495 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003496 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003497
3498 /* Copy back the bytes variables, which might have been modified by the
3499 callback */
3500 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3501 if (!inputobj)
3502 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003503 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003505 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003506 *input = PyBytes_AS_STRING(inputobj);
3507 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003508 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003509 /* we can DECREF safely, as the exception has another reference,
3510 so the object won't go away. */
3511 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003515 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3517 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519
3520 /* need more space? (at least enough for what we
3521 have+the replacement+the rest of the string (starting
3522 at the new input position), so we won't have to check space
3523 when there are no errors in the rest of the string) */
3524 repptr = PyUnicode_AS_UNICODE(repunicode);
3525 repsize = PyUnicode_GET_SIZE(repunicode);
3526 requiredsize = *outpos + repsize + insize-newpos;
3527 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 if (requiredsize<2*outsize)
3529 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003530 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 goto onError;
3532 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 }
3534 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003535 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 Py_UNICODE_COPY(*outptr, repptr, repsize);
3537 *outptr += repsize;
3538 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 /* we made it! */
3541 res = 0;
3542
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 Py_XDECREF(restuple);
3545 return res;
3546}
3547
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as an index into utf7_category.
 * directO and directWS are parenthesized so that expression arguments
 * (e.g. "a || b") keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628
Alexander Belopolsky40018472011-02-26 01:02:56 +00003629PyObject *
3630PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003631 Py_ssize_t size,
3632 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003633{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003634 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3635}
3636
Antoine Pitrou244651a2009-05-04 18:56:13 +00003637/* The decoder. The only state we preserve is our read position,
3638 * i.e. how many characters we have consumed. So if we end in the
3639 * middle of a shift sequence we have to back off the read position
3640 * and the output to the beginning of the sequence, otherwise we lose
3641 * all the shift state (seen bits, number of bits seen, high
3642 * surrogate). */
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 Py_ssize_t size,
3647 const char *errors,
3648 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003649{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003651 Py_ssize_t startinpos;
3652 Py_ssize_t endinpos;
3653 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003654 const char *e;
3655 PyUnicodeObject *unicode;
3656 Py_UNICODE *p;
3657 const char *errmsg = "";
3658 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003659 Py_UNICODE *shiftOutStart;
3660 unsigned int base64bits = 0;
3661 unsigned long base64buffer = 0;
3662 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 PyObject *errorHandler = NULL;
3664 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665
3666 unicode = _PyUnicode_New(size);
3667 if (!unicode)
3668 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003669 if (size == 0) {
3670 if (consumed)
3671 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003672 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003673 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003676 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677 e = s + size;
3678
3679 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003682 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683
Antoine Pitrou244651a2009-05-04 18:56:13 +00003684 if (inShift) { /* in a base-64 section */
3685 if (IS_BASE64(ch)) { /* consume a base-64 character */
3686 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3687 base64bits += 6;
3688 s++;
3689 if (base64bits >= 16) {
3690 /* we have enough bits for a UTF-16 value */
3691 Py_UNICODE outCh = (Py_UNICODE)
3692 (base64buffer >> (base64bits-16));
3693 base64bits -= 16;
3694 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3695 if (surrogate) {
3696 /* expecting a second surrogate */
3697 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3698#ifdef Py_UNICODE_WIDE
3699 *p++ = (((surrogate & 0x3FF)<<10)
3700 | (outCh & 0x3FF)) + 0x10000;
3701#else
3702 *p++ = surrogate;
3703 *p++ = outCh;
3704#endif
3705 surrogate = 0;
3706 }
3707 else {
3708 surrogate = 0;
3709 errmsg = "second surrogate missing";
3710 goto utf7Error;
3711 }
3712 }
3713 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3714 /* first surrogate */
3715 surrogate = outCh;
3716 }
3717 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3718 errmsg = "unexpected second surrogate";
3719 goto utf7Error;
3720 }
3721 else {
3722 *p++ = outCh;
3723 }
3724 }
3725 }
3726 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727 inShift = 0;
3728 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003729 if (surrogate) {
3730 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003731 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003732 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003733 if (base64bits > 0) { /* left-over bits */
3734 if (base64bits >= 6) {
3735 /* We've seen at least one base-64 character */
3736 errmsg = "partial character in shift sequence";
3737 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003739 else {
3740 /* Some bits remain; they should be zero */
3741 if (base64buffer != 0) {
3742 errmsg = "non-zero padding bits in shift sequence";
3743 goto utf7Error;
3744 }
3745 }
3746 }
3747 if (ch != '-') {
3748 /* '-' is absorbed; other terminating
3749 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003750 *p++ = ch;
3751 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003752 }
3753 }
3754 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003756 s++; /* consume '+' */
3757 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758 s++;
3759 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003760 }
3761 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003762 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003763 shiftOutStart = p;
3764 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765 }
3766 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003767 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768 *p++ = ch;
3769 s++;
3770 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003771 else {
3772 startinpos = s-starts;
3773 s++;
3774 errmsg = "unexpected special character";
3775 goto utf7Error;
3776 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003777 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 outpos = p-PyUnicode_AS_UNICODE(unicode);
3780 endinpos = s-starts;
3781 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 errors, &errorHandler,
3783 "utf7", errmsg,
3784 &starts, &e, &startinpos, &endinpos, &exc, &s,
3785 &unicode, &outpos, &p))
3786 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787 }
3788
Antoine Pitrou244651a2009-05-04 18:56:13 +00003789 /* end of string */
3790
3791 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3792 /* if we're in an inconsistent state, that's an error */
3793 if (surrogate ||
3794 (base64bits >= 6) ||
3795 (base64bits > 0 && base64buffer != 0)) {
3796 outpos = p-PyUnicode_AS_UNICODE(unicode);
3797 endinpos = size;
3798 if (unicode_decode_call_errorhandler(
3799 errors, &errorHandler,
3800 "utf7", "unterminated shift sequence",
3801 &starts, &e, &startinpos, &endinpos, &exc, &s,
3802 &unicode, &outpos, &p))
3803 goto onError;
3804 if (s < e)
3805 goto restart;
3806 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003808
3809 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003810 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003811 if (inShift) {
3812 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003813 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003814 }
3815 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003816 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003817 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003818 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819
Victor Stinnerfe226c02011-10-03 03:52:20 +02003820 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003821 goto onError;
3822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_XDECREF(errorHandler);
3824 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003825#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003826 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 Py_DECREF(unicode);
3828 return NULL;
3829 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003830#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003831 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003832 return (PyObject *)unicode;
3833
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_XDECREF(errorHandler);
3836 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003837 Py_DECREF(unicode);
3838 return NULL;
3839}
3840
3841
Alexander Belopolsky40018472011-02-26 01:02:56 +00003842PyObject *
3843PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003844 Py_ssize_t size,
3845 int base64SetO,
3846 int base64WhiteSpace,
3847 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003848{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003849 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003850 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003851 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003853 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003854 unsigned int base64bits = 0;
3855 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856 char * out;
3857 char * start;
3858
3859 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003861
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003862 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003863 return PyErr_NoMemory();
3864
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003866 if (v == NULL)
3867 return NULL;
3868
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003869 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 for (;i < size; ++i) {
3871 Py_UNICODE ch = s[i];
3872
Antoine Pitrou244651a2009-05-04 18:56:13 +00003873 if (inShift) {
3874 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3875 /* shifting out */
3876 if (base64bits) { /* output remaining bits */
3877 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3878 base64buffer = 0;
3879 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 }
3881 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 /* Characters not in the BASE64 set implicitly unshift the sequence
3883 so no '-' is required, except if the character is itself a '-' */
3884 if (IS_BASE64(ch) || ch == '-') {
3885 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003887 *out++ = (char) ch;
3888 }
3889 else {
3890 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003891 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 else { /* not in a shift sequence */
3894 if (ch == '+') {
3895 *out++ = '+';
3896 *out++ = '-';
3897 }
3898 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3899 *out++ = (char) ch;
3900 }
3901 else {
3902 *out++ = '+';
3903 inShift = 1;
3904 goto encode_char;
3905 }
3906 }
3907 continue;
3908encode_char:
3909#ifdef Py_UNICODE_WIDE
3910 if (ch >= 0x10000) {
3911 /* code first surrogate */
3912 base64bits += 16;
3913 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3914 while (base64bits >= 6) {
3915 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3916 base64bits -= 6;
3917 }
3918 /* prepare second surrogate */
3919 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3920 }
3921#endif
3922 base64bits += 16;
3923 base64buffer = (base64buffer << 16) | ch;
3924 while (base64bits >= 6) {
3925 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3926 base64bits -= 6;
3927 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003928 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003929 if (base64bits)
3930 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3931 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003932 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003933 if (_PyBytes_Resize(&v, out - start) < 0)
3934 return NULL;
3935 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003936}
3937
Antoine Pitrou244651a2009-05-04 18:56:13 +00003938#undef IS_BASE64
3939#undef FROM_BASE64
3940#undef TO_BASE64
3941#undef DECODE_DIRECT
3942#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944/* --- UTF-8 Codec -------------------------------------------------------- */
3945
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3967
Alexander Belopolsky40018472011-02-26 01:02:56 +00003968PyObject *
3969PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003970 Py_ssize_t size,
3971 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972{
Walter Dörwald69652032004-09-07 20:24:22 +00003973 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3974}
3975
Antoine Pitrouab868312009-01-10 15:40:25 +00003976/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3977#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3978
3979/* Mask to quickly check whether a C 'long' contains a
3980 non-ASCII, UTF8-encoded char. */
3981#if (SIZEOF_LONG == 8)
3982# define ASCII_CHAR_MASK 0x8080808080808080L
3983#elif (SIZEOF_LONG == 4)
3984# define ASCII_CHAR_MASK 0x80808080L
3985#else
3986# error C 'long' size should be either 4 or 8!
3987#endif
3988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989/* Scans a UTF-8 string and returns the maximum character to be expected,
3990 the size of the decoded unicode string and if any major errors were
3991 encountered.
3992
3993 This function does check basic UTF-8 sanity, it does however NOT CHECK
3994 if the string contains surrogates, and if all continuation bytes are
3995 within the correct ranges, these checks are performed in
3996 PyUnicode_DecodeUTF8Stateful.
3997
3998 If it sets has_errors to 1, it means the value of unicode_size and max_char
3999 will be bogus and you should not rely on useful information in them.
4000 */
4001static Py_UCS4
4002utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4003 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4004 int *has_errors)
4005{
4006 Py_ssize_t n;
4007 Py_ssize_t char_count = 0;
4008 Py_UCS4 max_char = 127, new_max;
4009 Py_UCS4 upper_bound;
4010 const unsigned char *p = (const unsigned char *)s;
4011 const unsigned char *end = p + string_size;
4012 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4013 int err = 0;
4014
4015 for (; p < end && !err; ++p, ++char_count) {
4016 /* Only check value if it's not a ASCII char... */
4017 if (*p < 0x80) {
4018 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4019 an explanation. */
4020 if (!((size_t) p & LONG_PTR_MASK)) {
4021 /* Help register allocation */
4022 register const unsigned char *_p = p;
4023 while (_p < aligned_end) {
4024 unsigned long value = *(unsigned long *) _p;
4025 if (value & ASCII_CHAR_MASK)
4026 break;
4027 _p += SIZEOF_LONG;
4028 char_count += SIZEOF_LONG;
4029 }
4030 p = _p;
4031 if (p == end)
4032 break;
4033 }
4034 }
4035 if (*p >= 0x80) {
4036 n = utf8_code_length[*p];
4037 new_max = max_char;
4038 switch (n) {
4039 /* invalid start byte */
4040 case 0:
4041 err = 1;
4042 break;
4043 case 2:
4044 /* Code points between 0x00FF and 0x07FF inclusive.
4045 Approximate the upper bound of the code point,
4046 if this flips over 255 we can be sure it will be more
4047 than 255 and the string will need 2 bytes per code coint,
4048 if it stays under or equal to 255, we can be sure 1 byte
4049 is enough.
4050 ((*p & 0b00011111) << 6) | 0b00111111 */
4051 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4052 if (max_char < upper_bound)
4053 new_max = upper_bound;
4054 /* Ensure we track at least that we left ASCII space. */
4055 if (new_max < 128)
4056 new_max = 128;
4057 break;
4058 case 3:
4059 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4060 always > 255 and <= 65535 and will always need 2 bytes. */
4061 if (max_char < 65535)
4062 new_max = 65535;
4063 break;
4064 case 4:
4065 /* Code point will be above 0xFFFF for sure in this case. */
4066 new_max = 65537;
4067 break;
4068 /* Internal error, this should be caught by the first if */
4069 case 1:
4070 default:
4071 assert(0 && "Impossible case in utf8_max_char_and_size");
4072 err = 1;
4073 }
4074 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004075 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 --n;
4077 /* Check if the follow up chars are all valid continuation bytes */
4078 if (n >= 1) {
4079 const unsigned char *cont;
4080 if ((p + n) >= end) {
4081 if (consumed == 0)
4082 /* incomplete data, non-incremental decoding */
4083 err = 1;
4084 break;
4085 }
4086 for (cont = p + 1; cont < (p + n); ++cont) {
4087 if ((*cont & 0xc0) != 0x80) {
4088 err = 1;
4089 break;
4090 }
4091 }
4092 p += n;
4093 }
4094 else
4095 err = 1;
4096 max_char = new_max;
4097 }
4098 }
4099
4100 if (unicode_size)
4101 *unicode_size = char_count;
4102 if (has_errors)
4103 *has_errors = err;
4104 return max_char;
4105}
4106
/* Store `value` at position `index` of `buf`, using the element type
   that corresponds to `kind`.  Works like PyUnicode_WRITE but can also
   write into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND).  Any kind other than the three named ones is
   written as Py_UCS4.  `kind`, `index` and `value` are each evaluated
   exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int target_kind_ = (kind);                                    \
        switch (target_kind_) {                                             \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4121
Alexander Belopolsky40018472011-02-26 01:02:56 +00004122PyObject *
4123PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124 Py_ssize_t size,
4125 const char *errors,
4126 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004127{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004130 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t startinpos;
4132 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004133 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004135 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 PyObject *errorHandler = NULL;
4137 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 Py_UCS4 maxchar = 0;
4139 Py_ssize_t unicode_size;
4140 Py_ssize_t i;
4141 int kind;
4142 void *data;
4143 int has_errors;
4144 Py_UNICODE *error_outptr;
4145#if SIZEOF_WCHAR_T == 2
4146 Py_ssize_t wchar_offset = 0;
4147#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Walter Dörwald69652032004-09-07 20:24:22 +00004149 if (size == 0) {
4150 if (consumed)
4151 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4155 consumed, &has_errors);
4156 if (has_errors) {
4157 unicode = _PyUnicode_New(size);
4158 if (!unicode)
4159 return NULL;
4160 kind = PyUnicode_WCHAR_KIND;
4161 data = PyUnicode_AS_UNICODE(unicode);
4162 assert(data != NULL);
4163 }
4164 else {
4165 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4166 if (!unicode)
4167 return NULL;
4168 /* When the string is ASCII only, just use memcpy and return.
4169 unicode_size may be != size if there is an incomplete UTF-8
4170 sequence at the end of the ASCII block. */
4171 if (maxchar < 128 && size == unicode_size) {
4172 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4173 return (PyObject *)unicode;
4174 }
4175 kind = PyUnicode_KIND(unicode);
4176 data = PyUnicode_DATA(unicode);
4177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004181 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182
4183 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004184 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004187 /* Fast path for runs of ASCII characters. Given that common UTF-8
4188 input will consist of an overwhelming majority of ASCII
4189 characters, we try to optimize for this case by checking
4190 as many characters as a C 'long' can contain.
4191 First, check if we can do an aligned read, as most CPUs have
4192 a penalty for unaligned reads.
4193 */
4194 if (!((size_t) s & LONG_PTR_MASK)) {
4195 /* Help register allocation */
4196 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004198 while (_s < aligned_end) {
4199 /* Read a whole long at a time (either 4 or 8 bytes),
4200 and do a fast unrolled copy if it only contains ASCII
4201 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202 unsigned long value = *(unsigned long *) _s;
4203 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4206 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4207 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4208 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004209#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4211 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4212 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4213 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004214#endif
4215 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004216 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004217 }
4218 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004220 if (s == e)
4221 break;
4222 ch = (unsigned char)*s;
4223 }
4224 }
4225
4226 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 s++;
4229 continue;
4230 }
4231
4232 n = utf8_code_length[ch];
4233
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004234 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 if (consumed)
4236 break;
4237 else {
4238 errmsg = "unexpected end of data";
4239 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004240 endinpos = startinpos+1;
4241 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4242 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 goto utf8Error;
4244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246
4247 switch (n) {
4248
4249 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004250 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 startinpos = s-starts;
4252 endinpos = startinpos+1;
4253 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254
4255 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004256 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 startinpos = s-starts;
4258 endinpos = startinpos+1;
4259 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260
4261 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004262 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004263 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004265 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 goto utf8Error;
4267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004269 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 break;
4272
4273 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004274 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4275 will result in surrogates in range d800-dfff. Surrogates are
4276 not valid UTF-8 so they are rejected.
4277 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4278 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004279 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004280 (s[2] & 0xc0) != 0x80 ||
4281 ((unsigned char)s[0] == 0xE0 &&
4282 (unsigned char)s[1] < 0xA0) ||
4283 ((unsigned char)s[0] == 0xED &&
4284 (unsigned char)s[1] > 0x9F)) {
4285 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004287 endinpos = startinpos + 1;
4288
4289 /* if s[1] first two bits are 1 and 0, then the invalid
4290 continuation byte is s[2], so increment endinpos by 1,
4291 if not, s[1] is invalid and endinpos doesn't need to
4292 be incremented. */
4293 if ((s[1] & 0xC0) == 0x80)
4294 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 goto utf8Error;
4296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004298 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004300 break;
4301
4302 case 4:
4303 if ((s[1] & 0xc0) != 0x80 ||
4304 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004305 (s[3] & 0xc0) != 0x80 ||
4306 ((unsigned char)s[0] == 0xF0 &&
4307 (unsigned char)s[1] < 0x90) ||
4308 ((unsigned char)s[0] == 0xF4 &&
4309 (unsigned char)s[1] > 0x8F)) {
4310 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004312 endinpos = startinpos + 1;
4313 if ((s[1] & 0xC0) == 0x80) {
4314 endinpos++;
4315 if ((s[2] & 0xC0) == 0x80)
4316 endinpos++;
4317 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto utf8Error;
4319 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004320 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004321 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4322 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324 /* If the string is flexible or we have native UCS-4, write
4325 directly.. */
4326 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4327 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004329 else {
4330 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 /* translate from 10000..10FFFF to 0..FFFF */
4333 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004335 /* high surrogate = top 10 bits added to D800 */
4336 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4337 (Py_UNICODE)(0xD800 + (ch >> 10)));
4338
4339 /* low surrogate = bottom 10 bits added to DC00 */
4340 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4341 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4342 }
4343#if SIZEOF_WCHAR_T == 2
4344 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004345#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
4348 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004350
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 /* If this is not yet a resizable string, make it one.. */
4353 if (kind != PyUnicode_WCHAR_KIND) {
4354 const Py_UNICODE *u;
4355 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4356 if (!new_unicode)
4357 goto onError;
4358 u = PyUnicode_AsUnicode((PyObject *)unicode);
4359 if (!u)
4360 goto onError;
4361#if SIZEOF_WCHAR_T == 2
4362 i += wchar_offset;
4363#endif
4364 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4365 Py_DECREF(unicode);
4366 unicode = new_unicode;
4367 kind = 0;
4368 data = PyUnicode_AS_UNICODE(new_unicode);
4369 assert(data != NULL);
4370 }
4371 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 if (unicode_decode_call_errorhandler(
4373 errors, &errorHandler,
4374 "utf8", errmsg,
4375 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004378 /* Update data because unicode_decode_call_errorhandler might have
4379 re-created or resized the unicode object. */
4380 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 /* Ensure the unicode_size calculation above was correct: */
4384 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4385
Walter Dörwald69652032004-09-07 20:24:22 +00004386 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389 /* Adjust length and ready string when it contained errors and
4390 is of the old resizable kind. */
4391 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004392 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 goto onError;
4394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 Py_XDECREF(errorHandler);
4397 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004398#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004399 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004400 Py_DECREF(unicode);
4401 return NULL;
4402 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004403#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004404 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return (PyObject *)unicode;
4406
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 Py_XDECREF(errorHandler);
4409 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 Py_DECREF(unicode);
4411 return NULL;
4412}
4413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004414#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004415
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004416#ifdef __APPLE__
4417
4418/* Simplified UTF-8 decoder using surrogateescape error handler,
4419 used to decode the command line arguments on Mac OS X. */
4420
4421wchar_t*
4422_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4423{
4424 int n;
4425 const char *e;
4426 wchar_t *unicode, *p;
4427
4428 /* Note: size will always be longer than the resulting Unicode
4429 character count */
4430 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4431 PyErr_NoMemory();
4432 return NULL;
4433 }
4434 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4435 if (!unicode)
4436 return NULL;
4437
4438 /* Unpack UTF-8 encoded data */
4439 p = unicode;
4440 e = s + size;
4441 while (s < e) {
4442 Py_UCS4 ch = (unsigned char)*s;
4443
4444 if (ch < 0x80) {
4445 *p++ = (wchar_t)ch;
4446 s++;
4447 continue;
4448 }
4449
4450 n = utf8_code_length[ch];
4451 if (s + n > e) {
4452 goto surrogateescape;
4453 }
4454
4455 switch (n) {
4456 case 0:
4457 case 1:
4458 goto surrogateescape;
4459
4460 case 2:
4461 if ((s[1] & 0xc0) != 0x80)
4462 goto surrogateescape;
4463 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4464 assert ((ch > 0x007F) && (ch <= 0x07FF));
4465 *p++ = (wchar_t)ch;
4466 break;
4467
4468 case 3:
4469 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4470 will result in surrogates in range d800-dfff. Surrogates are
4471 not valid UTF-8 so they are rejected.
4472 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4473 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4474 if ((s[1] & 0xc0) != 0x80 ||
4475 (s[2] & 0xc0) != 0x80 ||
4476 ((unsigned char)s[0] == 0xE0 &&
4477 (unsigned char)s[1] < 0xA0) ||
4478 ((unsigned char)s[0] == 0xED &&
4479 (unsigned char)s[1] > 0x9F)) {
4480
4481 goto surrogateescape;
4482 }
4483 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4484 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004485 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004486 break;
4487
4488 case 4:
4489 if ((s[1] & 0xc0) != 0x80 ||
4490 (s[2] & 0xc0) != 0x80 ||
4491 (s[3] & 0xc0) != 0x80 ||
4492 ((unsigned char)s[0] == 0xF0 &&
4493 (unsigned char)s[1] < 0x90) ||
4494 ((unsigned char)s[0] == 0xF4 &&
4495 (unsigned char)s[1] > 0x8F)) {
4496 goto surrogateescape;
4497 }
4498 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4499 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4500 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4501
4502#if SIZEOF_WCHAR_T == 4
4503 *p++ = (wchar_t)ch;
4504#else
4505 /* compute and append the two surrogates: */
4506
4507 /* translate from 10000..10FFFF to 0..FFFF */
4508 ch -= 0x10000;
4509
4510 /* high surrogate = top 10 bits added to D800 */
4511 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4512
4513 /* low surrogate = bottom 10 bits added to DC00 */
4514 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4515#endif
4516 break;
4517 }
4518 s += n;
4519 continue;
4520
4521 surrogateescape:
4522 *p++ = 0xDC00 + ch;
4523 s++;
4524 }
4525 *p = L'\0';
4526 return unicode;
4527}
4528
4529#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531/* Primary internal function which creates utf8 encoded bytes objects.
4532
4533 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004534 and allocate exactly as much space needed at the end. Else allocate the
4535 maximum possible needed (4 result bytes per Unicode character), and return
4536 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004537*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004538PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004539_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540{
Tim Peters602f7402002-04-27 18:03:26 +00004541#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004542
Guido van Rossum98297ee2007-11-06 21:34:58 +00004543 Py_ssize_t i; /* index into s of next input byte */
4544 PyObject *result; /* result string object */
4545 char *p; /* next free byte in output buffer */
4546 Py_ssize_t nallocated; /* number of result bytes allocated */
4547 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004548 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004549 PyObject *errorHandler = NULL;
4550 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004551 int kind;
4552 void *data;
4553 Py_ssize_t size;
4554 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4555#if SIZEOF_WCHAR_T == 2
4556 Py_ssize_t wchar_offset = 0;
4557#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004559 if (!PyUnicode_Check(unicode)) {
4560 PyErr_BadArgument();
4561 return NULL;
4562 }
4563
4564 if (PyUnicode_READY(unicode) == -1)
4565 return NULL;
4566
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004567 if (PyUnicode_UTF8(unicode))
4568 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4569 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004570
4571 kind = PyUnicode_KIND(unicode);
4572 data = PyUnicode_DATA(unicode);
4573 size = PyUnicode_GET_LENGTH(unicode);
4574
Tim Peters602f7402002-04-27 18:03:26 +00004575 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576
Tim Peters602f7402002-04-27 18:03:26 +00004577 if (size <= MAX_SHORT_UNICHARS) {
4578 /* Write into the stack buffer; nallocated can't overflow.
4579 * At the end, we'll allocate exactly as much heap space as it
4580 * turns out we need.
4581 */
4582 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004583 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004584 p = stackbuf;
4585 }
4586 else {
4587 /* Overallocate on the heap, and give the excess back at the end. */
4588 nallocated = size * 4;
4589 if (nallocated / 4 != size) /* overflow! */
4590 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004591 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004592 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004593 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004594 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004595 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004596
Tim Peters602f7402002-04-27 18:03:26 +00004597 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004598 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004599
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004600 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004601 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004605 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004606 *p++ = (char)(0xc0 | (ch >> 6));
4607 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004608 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004609 Py_ssize_t newpos;
4610 PyObject *rep;
4611 Py_ssize_t repsize, k, startpos;
4612 startpos = i-1;
4613#if SIZEOF_WCHAR_T == 2
4614 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004615#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 rep = unicode_encode_call_errorhandler(
4617 errors, &errorHandler, "utf-8", "surrogates not allowed",
4618 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4619 &exc, startpos, startpos+1, &newpos);
4620 if (!rep)
4621 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004623 if (PyBytes_Check(rep))
4624 repsize = PyBytes_GET_SIZE(rep);
4625 else
4626 repsize = PyUnicode_GET_SIZE(rep);
4627
4628 if (repsize > 4) {
4629 Py_ssize_t offset;
4630
4631 if (result == NULL)
4632 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004633 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004634 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004636 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4637 /* integer overflow */
4638 PyErr_NoMemory();
4639 goto error;
4640 }
4641 nallocated += repsize - 4;
4642 if (result != NULL) {
4643 if (_PyBytes_Resize(&result, nallocated) < 0)
4644 goto error;
4645 } else {
4646 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004647 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004648 goto error;
4649 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4650 }
4651 p = PyBytes_AS_STRING(result) + offset;
4652 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004654 if (PyBytes_Check(rep)) {
4655 char *prep = PyBytes_AS_STRING(rep);
4656 for(k = repsize; k > 0; k--)
4657 *p++ = *prep++;
4658 } else /* rep is unicode */ {
4659 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4660 Py_UNICODE c;
4661
4662 for(k=0; k<repsize; k++) {
4663 c = prep[k];
4664 if (0x80 <= c) {
4665 raise_encode_exception(&exc, "utf-8",
4666 PyUnicode_AS_UNICODE(unicode),
4667 size, i-1, i,
4668 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004669 goto error;
4670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004672 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004675 } else if (ch < 0x10000) {
4676 *p++ = (char)(0xe0 | (ch >> 12));
4677 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4678 *p++ = (char)(0x80 | (ch & 0x3f));
4679 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004680 /* Encode UCS4 Unicode ordinals */
4681 *p++ = (char)(0xf0 | (ch >> 18));
4682 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4683 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4684 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685#if SIZEOF_WCHAR_T == 2
4686 wchar_offset++;
4687#endif
Tim Peters602f7402002-04-27 18:03:26 +00004688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004690
Guido van Rossum98297ee2007-11-06 21:34:58 +00004691 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004692 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004693 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004694 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004695 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004696 }
4697 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004698 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004699 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004700 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004701 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004703
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004706 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004707 error:
4708 Py_XDECREF(errorHandler);
4709 Py_XDECREF(exc);
4710 Py_XDECREF(result);
4711 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004712
Tim Peters602f7402002-04-27 18:03:26 +00004713#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714}
4715
Alexander Belopolsky40018472011-02-26 01:02:56 +00004716PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004717PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4718 Py_ssize_t size,
4719 const char *errors)
4720{
4721 PyObject *v, *unicode;
4722
4723 unicode = PyUnicode_FromUnicode(s, size);
4724 if (unicode == NULL)
4725 return NULL;
4726 v = _PyUnicode_AsUTF8String(unicode, errors);
4727 Py_DECREF(unicode);
4728 return v;
4729}
4730
4731PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004732PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735}
4736
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737/* --- UTF-32 Codec ------------------------------------------------------- */
4738
4739PyObject *
4740PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 Py_ssize_t size,
4742 const char *errors,
4743 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744{
4745 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4746}
4747
4748PyObject *
4749PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 Py_ssize_t size,
4751 const char *errors,
4752 int *byteorder,
4753 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004754{
4755 const char *starts = s;
4756 Py_ssize_t startinpos;
4757 Py_ssize_t endinpos;
4758 Py_ssize_t outpos;
4759 PyUnicodeObject *unicode;
4760 Py_UNICODE *p;
4761#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004762 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004763 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004764#else
4765 const int pairs = 0;
4766#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004767 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004768 int bo = 0; /* assume native ordering by default */
4769 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004770 /* Offsets from q for retrieving bytes in the right order. */
4771#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4772 int iorder[] = {0, 1, 2, 3};
4773#else
4774 int iorder[] = {3, 2, 1, 0};
4775#endif
4776 PyObject *errorHandler = NULL;
4777 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004778
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779 q = (unsigned char *)s;
4780 e = q + size;
4781
4782 if (byteorder)
4783 bo = *byteorder;
4784
4785 /* Check for BOM marks (U+FEFF) in the input and adjust current
4786 byte order setting accordingly. In native mode, the leading BOM
4787 mark is skipped, in all other modes, it is copied to the output
4788 stream as-is (giving a ZWNBSP character). */
4789 if (bo == 0) {
4790 if (size >= 4) {
4791 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004793#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 if (bom == 0x0000FEFF) {
4795 q += 4;
4796 bo = -1;
4797 }
4798 else if (bom == 0xFFFE0000) {
4799 q += 4;
4800 bo = 1;
4801 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (bom == 0x0000FEFF) {
4804 q += 4;
4805 bo = 1;
4806 }
4807 else if (bom == 0xFFFE0000) {
4808 q += 4;
4809 bo = -1;
4810 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004811#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004813 }
4814
4815 if (bo == -1) {
4816 /* force LE */
4817 iorder[0] = 0;
4818 iorder[1] = 1;
4819 iorder[2] = 2;
4820 iorder[3] = 3;
4821 }
4822 else if (bo == 1) {
4823 /* force BE */
4824 iorder[0] = 3;
4825 iorder[1] = 2;
4826 iorder[2] = 1;
4827 iorder[3] = 0;
4828 }
4829
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004830 /* On narrow builds we split characters outside the BMP into two
4831 codepoints => count how much extra space we need. */
4832#ifndef Py_UNICODE_WIDE
4833 for (qq = q; qq < e; qq += 4)
4834 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4835 pairs++;
4836#endif
4837
4838 /* This might be one to much, because of a BOM */
4839 unicode = _PyUnicode_New((size+3)/4+pairs);
4840 if (!unicode)
4841 return NULL;
4842 if (size == 0)
4843 return (PyObject *)unicode;
4844
4845 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004847
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 Py_UCS4 ch;
4850 /* remaining bytes at the end? (size should be divisible by 4) */
4851 if (e-q<4) {
4852 if (consumed)
4853 break;
4854 errmsg = "truncated data";
4855 startinpos = ((const char *)q)-starts;
4856 endinpos = ((const char *)e)-starts;
4857 goto utf32Error;
4858 /* The remaining input chars are ignored if the callback
4859 chooses to skip the input */
4860 }
4861 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4862 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004863
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 if (ch >= 0x110000)
4865 {
4866 errmsg = "codepoint not in range(0x110000)";
4867 startinpos = ((const char *)q)-starts;
4868 endinpos = startinpos+4;
4869 goto utf32Error;
4870 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004871#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 if (ch >= 0x10000)
4873 {
4874 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4875 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4876 }
4877 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 *p++ = ch;
4880 q += 4;
4881 continue;
4882 utf32Error:
4883 outpos = p-PyUnicode_AS_UNICODE(unicode);
4884 if (unicode_decode_call_errorhandler(
4885 errors, &errorHandler,
4886 "utf32", errmsg,
4887 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4888 &unicode, &outpos, &p))
4889 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004890 }
4891
4892 if (byteorder)
4893 *byteorder = bo;
4894
4895 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897
4898 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004899 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900 goto onError;
4901
4902 Py_XDECREF(errorHandler);
4903 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004904#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004905 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906 Py_DECREF(unicode);
4907 return NULL;
4908 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004909#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004910 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911 return (PyObject *)unicode;
4912
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914 Py_DECREF(unicode);
4915 Py_XDECREF(errorHandler);
4916 Py_XDECREF(exc);
4917 return NULL;
4918}
4919
4920PyObject *
4921PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 Py_ssize_t size,
4923 const char *errors,
4924 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004926 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004928 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004930 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004931#else
4932 const int pairs = 0;
4933#endif
4934 /* Offsets from p for storing byte pairs in the right order. */
4935#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4936 int iorder[] = {0, 1, 2, 3};
4937#else
4938 int iorder[] = {3, 2, 1, 0};
4939#endif
4940
Benjamin Peterson29060642009-01-31 22:14:21 +00004941#define STORECHAR(CH) \
4942 do { \
4943 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4944 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4945 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4946 p[iorder[0]] = (CH) & 0xff; \
4947 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 } while(0)
4949
4950 /* In narrow builds we can output surrogate pairs as one codepoint,
4951 so we need less space. */
4952#ifndef Py_UNICODE_WIDE
4953 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4955 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4956 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004958 nsize = (size - pairs + (byteorder == 0));
4959 bytesize = nsize * 4;
4960 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004962 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963 if (v == NULL)
4964 return NULL;
4965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004966 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004970 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971
4972 if (byteorder == -1) {
4973 /* force LE */
4974 iorder[0] = 0;
4975 iorder[1] = 1;
4976 iorder[2] = 2;
4977 iorder[3] = 3;
4978 }
4979 else if (byteorder == 1) {
4980 /* force BE */
4981 iorder[0] = 3;
4982 iorder[1] = 2;
4983 iorder[2] = 1;
4984 iorder[3] = 0;
4985 }
4986
4987 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4991 Py_UCS4 ch2 = *s;
4992 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4993 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4994 s++;
4995 size--;
4996 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004997 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004998#endif
4999 STORECHAR(ch);
5000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005001
5002 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005003 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004#undef STORECHAR
5005}
5006
Alexander Belopolsky40018472011-02-26 01:02:56 +00005007PyObject *
5008PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009{
5010 if (!PyUnicode_Check(unicode)) {
5011 PyErr_BadArgument();
5012 return NULL;
5013 }
5014 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 PyUnicode_GET_SIZE(unicode),
5016 NULL,
5017 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005018}
5019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020/* --- UTF-16 Codec ------------------------------------------------------- */
5021
Tim Peters772747b2001-08-09 22:21:55 +00005022PyObject *
5023PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 Py_ssize_t size,
5025 const char *errors,
5026 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027{
Walter Dörwald69652032004-09-07 20:24:22 +00005028 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5029}
5030
Antoine Pitrouab868312009-01-10 15:40:25 +00005031/* Two masks for fast checking of whether a C 'long' may contain
5032 UTF16-encoded surrogate characters. This is an efficient heuristic,
5033 assuming that non-surrogate characters with a code point >= 0x8000 are
5034 rare in most input.
5035 FAST_CHAR_MASK is used when the input is in native byte ordering,
5036 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005037*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005038#if (SIZEOF_LONG == 8)
5039# define FAST_CHAR_MASK 0x8000800080008000L
5040# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5041#elif (SIZEOF_LONG == 4)
5042# define FAST_CHAR_MASK 0x80008000L
5043# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5044#else
5045# error C 'long' size should be either 4 or 8!
5046#endif
5047
Walter Dörwald69652032004-09-07 20:24:22 +00005048PyObject *
5049PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 Py_ssize_t size,
5051 const char *errors,
5052 int *byteorder,
5053 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005054{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056 Py_ssize_t startinpos;
5057 Py_ssize_t endinpos;
5058 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 PyUnicodeObject *unicode;
5060 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005061 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005062 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005063 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005064 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005065 /* Offsets from q for retrieving byte pairs in the right order. */
5066#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5067 int ihi = 1, ilo = 0;
5068#else
5069 int ihi = 0, ilo = 1;
5070#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 PyObject *errorHandler = NULL;
5072 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
5074 /* Note: size will always be longer than the resulting Unicode
5075 character count */
5076 unicode = _PyUnicode_New(size);
5077 if (!unicode)
5078 return NULL;
5079 if (size == 0)
5080 return (PyObject *)unicode;
5081
5082 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005084 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005085 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
5087 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005088 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005090 /* Check for BOM marks (U+FEFF) in the input and adjust current
5091 byte order setting accordingly. In native mode, the leading BOM
5092 mark is skipped, in all other modes, it is copied to the output
5093 stream as-is (giving a ZWNBSP character). */
5094 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005095 if (size >= 2) {
5096 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 if (bom == 0xFEFF) {
5099 q += 2;
5100 bo = -1;
5101 }
5102 else if (bom == 0xFFFE) {
5103 q += 2;
5104 bo = 1;
5105 }
Tim Petersced69f82003-09-16 20:30:58 +00005106#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 if (bom == 0xFEFF) {
5108 q += 2;
5109 bo = 1;
5110 }
5111 else if (bom == 0xFFFE) {
5112 q += 2;
5113 bo = -1;
5114 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005115#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118
Tim Peters772747b2001-08-09 22:21:55 +00005119 if (bo == -1) {
5120 /* force LE */
5121 ihi = 1;
5122 ilo = 0;
5123 }
5124 else if (bo == 1) {
5125 /* force BE */
5126 ihi = 0;
5127 ilo = 1;
5128 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005129#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5130 native_ordering = ilo < ihi;
5131#else
5132 native_ordering = ilo > ihi;
5133#endif
Tim Peters772747b2001-08-09 22:21:55 +00005134
Antoine Pitrouab868312009-01-10 15:40:25 +00005135 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005136 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005138 /* First check for possible aligned read of a C 'long'. Unaligned
5139 reads are more expensive, better to defer to another iteration. */
5140 if (!((size_t) q & LONG_PTR_MASK)) {
5141 /* Fast path for runs of non-surrogate chars. */
5142 register const unsigned char *_q = q;
5143 Py_UNICODE *_p = p;
5144 if (native_ordering) {
5145 /* Native ordering is simple: as long as the input cannot
5146 possibly contain a surrogate char, do an unrolled copy
5147 of several 16-bit code points to the target object.
5148 The non-surrogate check is done on several input bytes
5149 at a time (as many as a C 'long' can contain). */
5150 while (_q < aligned_end) {
5151 unsigned long data = * (unsigned long *) _q;
5152 if (data & FAST_CHAR_MASK)
5153 break;
5154 _p[0] = ((unsigned short *) _q)[0];
5155 _p[1] = ((unsigned short *) _q)[1];
5156#if (SIZEOF_LONG == 8)
5157 _p[2] = ((unsigned short *) _q)[2];
5158 _p[3] = ((unsigned short *) _q)[3];
5159#endif
5160 _q += SIZEOF_LONG;
5161 _p += SIZEOF_LONG / 2;
5162 }
5163 }
5164 else {
5165 /* Byteswapped ordering is similar, but we must decompose
5166 the copy bytewise, and take care of zero'ing out the
5167 upper bytes if the target object is in 32-bit units
5168 (that is, in UCS-4 builds). */
5169 while (_q < aligned_end) {
5170 unsigned long data = * (unsigned long *) _q;
5171 if (data & SWAPPED_FAST_CHAR_MASK)
5172 break;
5173 /* Zero upper bytes in UCS-4 builds */
5174#if (Py_UNICODE_SIZE > 2)
5175 _p[0] = 0;
5176 _p[1] = 0;
5177#if (SIZEOF_LONG == 8)
5178 _p[2] = 0;
5179 _p[3] = 0;
5180#endif
5181#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005182 /* Issue #4916; UCS-4 builds on big endian machines must
5183 fill the two last bytes of each 4-byte unit. */
5184#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5185# define OFF 2
5186#else
5187# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005188#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005189 ((unsigned char *) _p)[OFF + 1] = _q[0];
5190 ((unsigned char *) _p)[OFF + 0] = _q[1];
5191 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5192 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5193#if (SIZEOF_LONG == 8)
5194 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5195 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5196 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5197 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5198#endif
5199#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005200 _q += SIZEOF_LONG;
5201 _p += SIZEOF_LONG / 2;
5202 }
5203 }
5204 p = _p;
5205 q = _q;
5206 if (q >= e)
5207 break;
5208 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210
Benjamin Peterson14339b62009-01-31 16:36:08 +00005211 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005212
5213 if (ch < 0xD800 || ch > 0xDFFF) {
5214 *p++ = ch;
5215 continue;
5216 }
5217
5218 /* UTF-16 code pair: */
5219 if (q > e) {
5220 errmsg = "unexpected end of data";
5221 startinpos = (((const char *)q) - 2) - starts;
5222 endinpos = ((const char *)e) + 1 - starts;
5223 goto utf16Error;
5224 }
5225 if (0xD800 <= ch && ch <= 0xDBFF) {
5226 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5227 q += 2;
5228 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005229#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 *p++ = ch;
5231 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005234#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 continue;
5236 }
5237 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005238 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 startinpos = (((const char *)q)-4)-starts;
5240 endinpos = startinpos+2;
5241 goto utf16Error;
5242 }
5243
Benjamin Peterson14339b62009-01-31 16:36:08 +00005244 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 errmsg = "illegal encoding";
5246 startinpos = (((const char *)q)-2)-starts;
5247 endinpos = startinpos+2;
5248 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005249
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 utf16Error:
5251 outpos = p - PyUnicode_AS_UNICODE(unicode);
5252 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005253 errors,
5254 &errorHandler,
5255 "utf16", errmsg,
5256 &starts,
5257 (const char **)&e,
5258 &startinpos,
5259 &endinpos,
5260 &exc,
5261 (const char **)&q,
5262 &unicode,
5263 &outpos,
5264 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005267 /* remaining byte at the end? (size should be even) */
5268 if (e == q) {
5269 if (!consumed) {
5270 errmsg = "truncated data";
5271 startinpos = ((const char *)q) - starts;
5272 endinpos = ((const char *)e) + 1 - starts;
5273 outpos = p - PyUnicode_AS_UNICODE(unicode);
5274 if (unicode_decode_call_errorhandler(
5275 errors,
5276 &errorHandler,
5277 "utf16", errmsg,
5278 &starts,
5279 (const char **)&e,
5280 &startinpos,
5281 &endinpos,
5282 &exc,
5283 (const char **)&q,
5284 &unicode,
5285 &outpos,
5286 &p))
5287 goto onError;
5288 /* The remaining input chars are ignored if the callback
5289 chooses to skip the input */
5290 }
5291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
5293 if (byteorder)
5294 *byteorder = bo;
5295
Walter Dörwald69652032004-09-07 20:24:22 +00005296 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005300 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 goto onError;
5302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005303 Py_XDECREF(errorHandler);
5304 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005305#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005306 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005307 Py_DECREF(unicode);
5308 return NULL;
5309 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005310#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005311 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 return (PyObject *)unicode;
5313
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005316 Py_XDECREF(errorHandler);
5317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 return NULL;
5319}
5320
Antoine Pitrouab868312009-01-10 15:40:25 +00005321#undef FAST_CHAR_MASK
5322#undef SWAPPED_FAST_CHAR_MASK
5323
Tim Peters772747b2001-08-09 22:21:55 +00005324PyObject *
5325PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 Py_ssize_t size,
5327 const char *errors,
5328 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005330 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005331 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005332 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005333#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005334 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005335#else
5336 const int pairs = 0;
5337#endif
Tim Peters772747b2001-08-09 22:21:55 +00005338 /* Offsets from p for storing byte pairs in the right order. */
5339#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5340 int ihi = 1, ilo = 0;
5341#else
5342 int ihi = 0, ilo = 1;
5343#endif
5344
Benjamin Peterson29060642009-01-31 22:14:21 +00005345#define STORECHAR(CH) \
5346 do { \
5347 p[ihi] = ((CH) >> 8) & 0xff; \
5348 p[ilo] = (CH) & 0xff; \
5349 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005350 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005352#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005353 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 if (s[i] >= 0x10000)
5355 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005356#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005357 /* 2 * (size + pairs + (byteorder == 0)) */
5358 if (size > PY_SSIZE_T_MAX ||
5359 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005361 nsize = size + pairs + (byteorder == 0);
5362 bytesize = nsize * 2;
5363 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005365 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 if (v == NULL)
5367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005369 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005372 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005373 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005374
5375 if (byteorder == -1) {
5376 /* force LE */
5377 ihi = 1;
5378 ilo = 0;
5379 }
5380 else if (byteorder == 1) {
5381 /* force BE */
5382 ihi = 0;
5383 ilo = 1;
5384 }
5385
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005386 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 Py_UNICODE ch = *s++;
5388 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005389#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 if (ch >= 0x10000) {
5391 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5392 ch = 0xD800 | ((ch-0x10000) >> 10);
5393 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005394#endif
Tim Peters772747b2001-08-09 22:21:55 +00005395 STORECHAR(ch);
5396 if (ch2)
5397 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005398 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005399
5400 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005401 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005402#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403}
5404
Alexander Belopolsky40018472011-02-26 01:02:56 +00005405PyObject *
5406PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407{
5408 if (!PyUnicode_Check(unicode)) {
5409 PyErr_BadArgument();
5410 return NULL;
5411 }
5412 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 PyUnicode_GET_SIZE(unicode),
5414 NULL,
5415 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416}
5417
5418/* --- Unicode Escape Codec ----------------------------------------------- */
5419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5421 if all the escapes in the string make it still a valid ASCII string.
5422 Returns -1 if any escapes were found which cause the string to
5423 pop out of ASCII range. Otherwise returns the length of the
5424 required buffer to hold the string.
5425 */
5426Py_ssize_t
5427length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5428{
5429 const unsigned char *p = (const unsigned char *)s;
5430 const unsigned char *end = p + size;
5431 Py_ssize_t length = 0;
5432
5433 if (size < 0)
5434 return -1;
5435
5436 for (; p < end; ++p) {
5437 if (*p > 127) {
5438 /* Non-ASCII */
5439 return -1;
5440 }
5441 else if (*p != '\\') {
5442 /* Normal character */
5443 ++length;
5444 }
5445 else {
5446 /* Backslash-escape, check next char */
5447 ++p;
5448 /* Escape sequence reaches till end of string or
5449 non-ASCII follow-up. */
5450 if (p >= end || *p > 127)
5451 return -1;
5452 switch (*p) {
5453 case '\n':
5454 /* backslash + \n result in zero characters */
5455 break;
5456 case '\\': case '\'': case '\"':
5457 case 'b': case 'f': case 't':
5458 case 'n': case 'r': case 'v': case 'a':
5459 ++length;
5460 break;
5461 case '0': case '1': case '2': case '3':
5462 case '4': case '5': case '6': case '7':
5463 case 'x': case 'u': case 'U': case 'N':
5464 /* these do not guarantee ASCII characters */
5465 return -1;
5466 default:
5467 /* count the backslash + the other character */
5468 length += 2;
5469 }
5470 }
5471 }
5472 return length;
5473}
5474
5475/* Similar to PyUnicode_WRITE but either write into wstr field
5476 or treat string as ASCII. */
5477#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5478 do { \
5479 if ((kind) != PyUnicode_WCHAR_KIND) \
5480 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5481 else \
5482 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5483 } while (0)
5484
5485#define WRITE_WSTR(buf, index, value) \
5486 assert(kind == PyUnicode_WCHAR_KIND), \
5487 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5488
5489
Fredrik Lundh06d12682001-01-24 07:59:11 +00005490static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005491
Alexander Belopolsky40018472011-02-26 01:02:56 +00005492PyObject *
5493PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005494 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005498 Py_ssize_t startinpos;
5499 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005504 char* message;
5505 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 PyObject *errorHandler = NULL;
5507 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005508 Py_ssize_t ascii_length;
5509 Py_ssize_t i;
5510 int kind;
5511 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005513 ascii_length = length_of_escaped_ascii_string(s, size);
5514
5515 /* After length_of_escaped_ascii_string() there are two alternatives,
5516 either the string is pure ASCII with named escapes like \n, etc.
5517 and we determined it's exact size (common case)
5518 or it contains \x, \u, ... escape sequences. then we create a
5519 legacy wchar string and resize it at the end of this function. */
5520 if (ascii_length >= 0) {
5521 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5522 if (!v)
5523 goto onError;
5524 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5525 kind = PyUnicode_1BYTE_KIND;
5526 data = PyUnicode_DATA(v);
5527 }
5528 else {
5529 /* Escaped strings will always be longer than the resulting
5530 Unicode string, so we start with size here and then reduce the
5531 length after conversion to the true value.
5532 (but if the error callback returns a long replacement string
5533 we'll have to allocate more space) */
5534 v = _PyUnicode_New(size);
5535 if (!v)
5536 goto onError;
5537 kind = PyUnicode_WCHAR_KIND;
5538 data = PyUnicode_AS_UNICODE(v);
5539 }
5540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 if (size == 0)
5542 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 while (s < end) {
5547 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005548 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 if (kind == PyUnicode_WCHAR_KIND) {
5552 assert(i < _PyUnicode_WSTR_LENGTH(v));
5553 }
5554 else {
5555 /* The only case in which i == ascii_length is a backslash
5556 followed by a newline. */
5557 assert(i <= ascii_length);
5558 }
5559
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 /* Non-escape characters are interpreted as Unicode ordinals */
5561 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 continue;
5564 }
5565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 /* \ - Escapes */
5568 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005569 c = *s++;
5570 if (s > end)
5571 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572
5573 if (kind == PyUnicode_WCHAR_KIND) {
5574 assert(i < _PyUnicode_WSTR_LENGTH(v));
5575 }
5576 else {
5577 /* The only case in which i == ascii_length is a backslash
5578 followed by a newline. */
5579 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5580 }
5581
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005582 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005586 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5587 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5588 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5589 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5590 /* FF */
5591 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5592 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5593 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5594 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5595 /* VT */
5596 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5597 /* BEL, not classic C */
5598 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 case '0': case '1': case '2': case '3':
5602 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005603 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005604 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005605 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005606 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005607 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 break;
5611
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 /* hex escapes */
5613 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005615 digits = 2;
5616 message = "truncated \\xXX escape";
5617 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005621 digits = 4;
5622 message = "truncated \\uXXXX escape";
5623 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005626 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 digits = 8;
5628 message = "truncated \\UXXXXXXXX escape";
5629 hexescape:
5630 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 if (s+digits>end) {
5633 endinpos = size;
5634 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 errors, &errorHandler,
5636 "unicodeescape", "end of string in escape sequence",
5637 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 goto nextByte;
5642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 for (j = 0; j < digits; ++j) {
5644 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005645 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 endinpos = (s+j+1)-starts;
5647 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 errors, &errorHandler,
5650 "unicodeescape", message,
5651 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005652 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005653 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005656 }
5657 chr = (chr<<4) & ~0xF;
5658 if (c >= '0' && c <= '9')
5659 chr += c - '0';
5660 else if (c >= 'a' && c <= 'f')
5661 chr += 10 + c - 'a';
5662 else
5663 chr += 10 + c - 'A';
5664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005666 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 /* _decoding_error will have already written into the
5668 target buffer. */
5669 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005671 /* when we get here, chr is a 32-bit unicode character */
5672 if (chr <= 0xffff)
5673 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005675 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005676 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005677 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005678#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005680#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005681 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005682 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5683 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005684#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005685 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005687 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 errors, &errorHandler,
5690 "unicodeescape", "illegal Unicode character",
5691 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005693 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005695 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696 break;
5697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005699 case 'N':
5700 message = "malformed \\N character escape";
5701 if (ucnhash_CAPI == NULL) {
5702 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5704 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005705 if (ucnhash_CAPI == NULL)
5706 goto ucnhashError;
5707 }
5708 if (*s == '{') {
5709 const char *start = s+1;
5710 /* look for the closing brace */
5711 while (*s != '}' && s < end)
5712 s++;
5713 if (s > start && s < end && *s == '}') {
5714 /* found a name. look it up in the unicode database */
5715 message = "unknown Unicode character name";
5716 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5718 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 goto store;
5720 }
5721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 errors, &errorHandler,
5726 "unicodeescape", message,
5727 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 break;
5732
5733 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005734 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 message = "\\ at end of string";
5737 s--;
5738 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 errors, &errorHandler,
5742 "unicodeescape", message,
5743 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005745 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005746 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005747 }
5748 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5750 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005751 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 /* Ensure the length prediction worked in case of ASCII strings */
5758 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5759
Victor Stinnerfe226c02011-10-03 03:52:20 +02005760 if (kind == PyUnicode_WCHAR_KIND)
5761 {
5762 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5763 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005764 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005765 Py_XDECREF(errorHandler);
5766 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005767#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005768 if (_PyUnicode_READY_REPLACE(&v)) {
5769 Py_DECREF(v);
5770 return NULL;
5771 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005772#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005773 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005775
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005777 PyErr_SetString(
5778 PyExc_UnicodeError,
5779 "\\N escapes not supported (can't load unicodedata module)"
5780 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005781 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 Py_XDECREF(errorHandler);
5783 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005784 return NULL;
5785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 Py_XDECREF(errorHandler);
5789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
5791}
5792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793#undef WRITE_ASCII_OR_WSTR
5794#undef WRITE_WSTR
5795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796/* Return a Unicode-Escape string version of the Unicode object.
5797
5798 If quotes is true, the string is enclosed in u"" or u'' quotes as
5799 appropriate.
5800
5801*/
5802
Walter Dörwald79e913e2007-05-12 11:08:06 +00005803static const char *hexdigits = "0123456789abcdef";
5804
Alexander Belopolsky40018472011-02-26 01:02:56 +00005805PyObject *
5806PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005807 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005809 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005812#ifdef Py_UNICODE_WIDE
5813 const Py_ssize_t expandsize = 10;
5814#else
5815 const Py_ssize_t expandsize = 6;
5816#endif
5817
Thomas Wouters89f507f2006-12-13 04:49:30 +00005818 /* XXX(nnorwitz): rather than over-allocating, it would be
5819 better to choose a different scheme. Perhaps scan the
5820 first N-chars of the string and allocate based on that size.
5821 */
5822 /* Initial allocation is based on the longest-possible unichr
5823 escape.
5824
5825 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5826 unichr, so in this case it's the longest unichr escape. In
5827 narrow (UTF-16) builds this is five chars per source unichr
5828 since there are two unichrs in the surrogate pair, so in narrow
5829 (UTF-16) builds it's not the longest unichr escape.
5830
5831 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5832 so in the narrow (UTF-16) build case it's the longest unichr
5833 escape.
5834 */
5835
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005836 if (size == 0)
5837 return PyBytes_FromStringAndSize(NULL, 0);
5838
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005839 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005841
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005842 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 2
5844 + expandsize*size
5845 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 if (repr == NULL)
5847 return NULL;
5848
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005849 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 while (size-- > 0) {
5852 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005853
Walter Dörwald79e913e2007-05-12 11:08:06 +00005854 /* Escape backslashes */
5855 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 *p++ = '\\';
5857 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005858 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005859 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005860
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005861#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005862 /* Map 21-bit characters to '\U00xxxxxx' */
5863 else if (ch >= 0x10000) {
5864 *p++ = '\\';
5865 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005866 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5867 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5868 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5869 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5870 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5871 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5872 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5873 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005875 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005876#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5878 else if (ch >= 0xD800 && ch < 0xDC00) {
5879 Py_UNICODE ch2;
5880 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005881
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 ch2 = *s++;
5883 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005884 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5886 *p++ = '\\';
5887 *p++ = 'U';
5888 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5889 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5890 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5891 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5892 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5893 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5894 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5895 *p++ = hexdigits[ucs & 0x0000000F];
5896 continue;
5897 }
5898 /* Fall through: isolated surrogates are copied as-is */
5899 s--;
5900 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005901 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005902#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005903
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005905 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 *p++ = '\\';
5907 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005908 *p++ = hexdigits[(ch >> 12) & 0x000F];
5909 *p++ = hexdigits[(ch >> 8) & 0x000F];
5910 *p++ = hexdigits[(ch >> 4) & 0x000F];
5911 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005914 /* Map special whitespace to '\t', \n', '\r' */
5915 else if (ch == '\t') {
5916 *p++ = '\\';
5917 *p++ = 't';
5918 }
5919 else if (ch == '\n') {
5920 *p++ = '\\';
5921 *p++ = 'n';
5922 }
5923 else if (ch == '\r') {
5924 *p++ = '\\';
5925 *p++ = 'r';
5926 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005928 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005929 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005931 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005932 *p++ = hexdigits[(ch >> 4) & 0x000F];
5933 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005934 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 /* Copy everything else as-is */
5937 else
5938 *p++ = (char) ch;
5939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005941 assert(p - PyBytes_AS_STRING(repr) > 0);
5942 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5943 return NULL;
5944 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947PyObject *
5948PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005950 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 if (!PyUnicode_Check(unicode)) {
5952 PyErr_BadArgument();
5953 return NULL;
5954 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005955 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5956 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005957 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958}
5959
5960/* --- Raw Unicode Escape Codec ------------------------------------------- */
5961
Alexander Belopolsky40018472011-02-26 01:02:56 +00005962PyObject *
5963PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005964 Py_ssize_t size,
5965 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005968 Py_ssize_t startinpos;
5969 Py_ssize_t endinpos;
5970 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 const char *end;
5974 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 PyObject *errorHandler = NULL;
5976 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 /* Escaped strings will always be longer than the resulting
5979 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 length after conversion to the true value. (But decoding error
5981 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 v = _PyUnicode_New(size);
5983 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 end = s + size;
5989 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 unsigned char c;
5991 Py_UCS4 x;
5992 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005993 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 /* Non-escape characters are interpreted as Unicode ordinals */
5996 if (*s != '\\') {
5997 *p++ = (unsigned char)*s++;
5998 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 startinpos = s-starts;
6001
6002 /* \u-escapes are only interpreted iff the number of leading
6003 backslashes if odd */
6004 bs = s;
6005 for (;s < end;) {
6006 if (*s != '\\')
6007 break;
6008 *p++ = (unsigned char)*s++;
6009 }
6010 if (((s - bs) & 1) == 0 ||
6011 s >= end ||
6012 (*s != 'u' && *s != 'U')) {
6013 continue;
6014 }
6015 p--;
6016 count = *s=='u' ? 4 : 8;
6017 s++;
6018
6019 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6020 outpos = p-PyUnicode_AS_UNICODE(v);
6021 for (x = 0, i = 0; i < count; ++i, ++s) {
6022 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006023 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 endinpos = s-starts;
6025 if (unicode_decode_call_errorhandler(
6026 errors, &errorHandler,
6027 "rawunicodeescape", "truncated \\uXXXX",
6028 &starts, &end, &startinpos, &endinpos, &exc, &s,
6029 &v, &outpos, &p))
6030 goto onError;
6031 goto nextByte;
6032 }
6033 x = (x<<4) & ~0xF;
6034 if (c >= '0' && c <= '9')
6035 x += c - '0';
6036 else if (c >= 'a' && c <= 'f')
6037 x += 10 + c - 'a';
6038 else
6039 x += 10 + c - 'A';
6040 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006041 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 /* UCS-2 character */
6043 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006044 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* UCS-4 character. Either store directly, or as
6046 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006047#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006049#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 x -= 0x10000L;
6051 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6052 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006053#endif
6054 } else {
6055 endinpos = s-starts;
6056 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006057 if (unicode_decode_call_errorhandler(
6058 errors, &errorHandler,
6059 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 &starts, &end, &startinpos, &endinpos, &exc, &s,
6061 &v, &outpos, &p))
6062 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006063 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 nextByte:
6065 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006067 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 Py_XDECREF(errorHandler);
6070 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006071#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006072 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006073 Py_DECREF(v);
6074 return NULL;
6075 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006076#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006077 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 Py_XDECREF(errorHandler);
6083 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 return NULL;
6085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
6088PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006089 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006091 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 char *p;
6093 char *q;
6094
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006095#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006096 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006097#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006098 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006099#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006100
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006101 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006103
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006104 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 if (repr == NULL)
6106 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006107 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006108 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006110 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 while (size-- > 0) {
6112 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006113#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 /* Map 32-bit characters to '\Uxxxxxxxx' */
6115 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006116 *p++ = '\\';
6117 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006118 *p++ = hexdigits[(ch >> 28) & 0xf];
6119 *p++ = hexdigits[(ch >> 24) & 0xf];
6120 *p++ = hexdigits[(ch >> 20) & 0xf];
6121 *p++ = hexdigits[(ch >> 16) & 0xf];
6122 *p++ = hexdigits[(ch >> 12) & 0xf];
6123 *p++ = hexdigits[(ch >> 8) & 0xf];
6124 *p++ = hexdigits[(ch >> 4) & 0xf];
6125 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006126 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006127 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006128#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6130 if (ch >= 0xD800 && ch < 0xDC00) {
6131 Py_UNICODE ch2;
6132 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006133
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 ch2 = *s++;
6135 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006136 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6138 *p++ = '\\';
6139 *p++ = 'U';
6140 *p++ = hexdigits[(ucs >> 28) & 0xf];
6141 *p++ = hexdigits[(ucs >> 24) & 0xf];
6142 *p++ = hexdigits[(ucs >> 20) & 0xf];
6143 *p++ = hexdigits[(ucs >> 16) & 0xf];
6144 *p++ = hexdigits[(ucs >> 12) & 0xf];
6145 *p++ = hexdigits[(ucs >> 8) & 0xf];
6146 *p++ = hexdigits[(ucs >> 4) & 0xf];
6147 *p++ = hexdigits[ucs & 0xf];
6148 continue;
6149 }
6150 /* Fall through: isolated surrogates are copied as-is */
6151 s--;
6152 size++;
6153 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006154#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 /* Map 16-bit characters to '\uxxxx' */
6156 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 *p++ = '\\';
6158 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006159 *p++ = hexdigits[(ch >> 12) & 0xf];
6160 *p++ = hexdigits[(ch >> 8) & 0xf];
6161 *p++ = hexdigits[(ch >> 4) & 0xf];
6162 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Copy everything else as-is */
6165 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 *p++ = (char) ch;
6167 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006168 size = p - q;
6169
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006170 assert(size > 0);
6171 if (_PyBytes_Resize(&repr, size) < 0)
6172 return NULL;
6173 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174}
6175
Alexander Belopolsky40018472011-02-26 01:02:56 +00006176PyObject *
6177PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006179 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006181 PyErr_BadArgument();
6182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006184 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6185 PyUnicode_GET_SIZE(unicode));
6186
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006187 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188}
6189
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190/* --- Unicode Internal Codec ------------------------------------------- */
6191
Alexander Belopolsky40018472011-02-26 01:02:56 +00006192PyObject *
6193_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006194 Py_ssize_t size,
6195 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006196{
6197 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006198 Py_ssize_t startinpos;
6199 Py_ssize_t endinpos;
6200 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201 PyUnicodeObject *v;
6202 Py_UNICODE *p;
6203 const char *end;
6204 const char *reason;
6205 PyObject *errorHandler = NULL;
6206 PyObject *exc = NULL;
6207
Neal Norwitzd43069c2006-01-08 01:12:10 +00006208#ifdef Py_UNICODE_WIDE
6209 Py_UNICODE unimax = PyUnicode_GetMax();
6210#endif
6211
Thomas Wouters89f507f2006-12-13 04:49:30 +00006212 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006213 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6214 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006216 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6217 as string was created with the old API. */
6218 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006220 p = PyUnicode_AS_UNICODE(v);
6221 end = s + size;
6222
6223 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006224 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006225 /* We have to sanity check the raw data, otherwise doom looms for
6226 some malformed UCS-4 data. */
6227 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006228#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006229 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006230#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006231 end-s < Py_UNICODE_SIZE
6232 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234 startinpos = s - starts;
6235 if (end-s < Py_UNICODE_SIZE) {
6236 endinpos = end-starts;
6237 reason = "truncated input";
6238 }
6239 else {
6240 endinpos = s - starts + Py_UNICODE_SIZE;
6241 reason = "illegal code point (> 0x10FFFF)";
6242 }
6243 outpos = p - PyUnicode_AS_UNICODE(v);
6244 if (unicode_decode_call_errorhandler(
6245 errors, &errorHandler,
6246 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006247 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006248 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 goto onError;
6250 }
6251 }
6252 else {
6253 p++;
6254 s += Py_UNICODE_SIZE;
6255 }
6256 }
6257
Victor Stinnerfe226c02011-10-03 03:52:20 +02006258 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006259 goto onError;
6260 Py_XDECREF(errorHandler);
6261 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006262#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006263 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006264 Py_DECREF(v);
6265 return NULL;
6266 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006267#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006268 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006269 return (PyObject *)v;
6270
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 Py_XDECREF(v);
6273 Py_XDECREF(errorHandler);
6274 Py_XDECREF(exc);
6275 return NULL;
6276}
6277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278/* --- Latin-1 Codec ------------------------------------------------------ */
6279
Alexander Belopolsky40018472011-02-26 01:02:56 +00006280PyObject *
6281PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006282 Py_ssize_t size,
6283 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006286 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287}
6288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290static void
6291make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 const char *encoding,
6293 const Py_UNICODE *unicode, Py_ssize_t size,
6294 Py_ssize_t startpos, Py_ssize_t endpos,
6295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 *exceptionObject = PyUnicodeEncodeError_Create(
6299 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 }
6301 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6303 goto onError;
6304 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6305 goto onError;
6306 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6307 goto onError;
6308 return;
6309 onError:
6310 Py_DECREF(*exceptionObject);
6311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
6313}
6314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316static void
6317raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006318 const char *encoding,
6319 const Py_UNICODE *unicode, Py_ssize_t size,
6320 Py_ssize_t startpos, Py_ssize_t endpos,
6321 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322{
6323 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327}
6328
6329/* error handling callback helper:
6330 build arguments, call the callback and check the arguments,
6331 put the result into newpos and return the replacement string, which
6332 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333static PyObject *
6334unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006335 PyObject **errorHandler,
6336 const char *encoding, const char *reason,
6337 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6338 Py_ssize_t startpos, Py_ssize_t endpos,
6339 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006341 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342
6343 PyObject *restuple;
6344 PyObject *resunicode;
6345
6346 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350 }
6351
6352 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356
6357 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006362 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_DECREF(restuple);
6364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006366 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 &resunicode, newpos)) {
6368 Py_DECREF(restuple);
6369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006371 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6372 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6373 Py_DECREF(restuple);
6374 return NULL;
6375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006378 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6380 Py_DECREF(restuple);
6381 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 Py_INCREF(resunicode);
6384 Py_DECREF(restuple);
6385 return resunicode;
6386}
6387
Alexander Belopolsky40018472011-02-26 01:02:56 +00006388static PyObject *
6389unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006390 Py_ssize_t size,
6391 const char *errors,
6392 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393{
6394 /* output object */
6395 PyObject *res;
6396 /* pointers to the beginning and end+1 of input */
6397 const Py_UNICODE *startp = p;
6398 const Py_UNICODE *endp = p + size;
6399 /* pointer to the beginning of the unencodable characters */
6400 /* const Py_UNICODE *badp = NULL; */
6401 /* pointer into the output */
6402 char *str;
6403 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006405 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6406 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 PyObject *errorHandler = NULL;
6408 PyObject *exc = NULL;
6409 /* the following variable is used for caching string comparisons
6410 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6411 int known_errorHandler = -1;
6412
6413 /* allocate enough for a simple encoding without
6414 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006415 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006416 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006417 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006419 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006420 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 ressize = size;
6422
6423 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* can we encode this? */
6427 if (c<limit) {
6428 /* no overflow check, because we know that the space is enough */
6429 *str++ = (char)c;
6430 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 else {
6433 Py_ssize_t unicodepos = p-startp;
6434 Py_ssize_t requiredsize;
6435 PyObject *repunicode;
6436 Py_ssize_t repsize;
6437 Py_ssize_t newpos;
6438 Py_ssize_t respos;
6439 Py_UNICODE *uni2;
6440 /* startpos for collecting unencodable chars */
6441 const Py_UNICODE *collstart = p;
6442 const Py_UNICODE *collend = p;
6443 /* find all unecodable characters */
6444 while ((collend < endp) && ((*collend)>=limit))
6445 ++collend;
6446 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6447 if (known_errorHandler==-1) {
6448 if ((errors==NULL) || (!strcmp(errors, "strict")))
6449 known_errorHandler = 1;
6450 else if (!strcmp(errors, "replace"))
6451 known_errorHandler = 2;
6452 else if (!strcmp(errors, "ignore"))
6453 known_errorHandler = 3;
6454 else if (!strcmp(errors, "xmlcharrefreplace"))
6455 known_errorHandler = 4;
6456 else
6457 known_errorHandler = 0;
6458 }
6459 switch (known_errorHandler) {
6460 case 1: /* strict */
6461 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6462 goto onError;
6463 case 2: /* replace */
6464 while (collstart++<collend)
6465 *str++ = '?'; /* fall through */
6466 case 3: /* ignore */
6467 p = collend;
6468 break;
6469 case 4: /* xmlcharrefreplace */
6470 respos = str - PyBytes_AS_STRING(res);
6471 /* determine replacement size (temporarily (mis)uses p) */
6472 for (p = collstart, repsize = 0; p < collend; ++p) {
6473 if (*p<10)
6474 repsize += 2+1+1;
6475 else if (*p<100)
6476 repsize += 2+2+1;
6477 else if (*p<1000)
6478 repsize += 2+3+1;
6479 else if (*p<10000)
6480 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006481#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 else
6483 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006484#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 else if (*p<100000)
6486 repsize += 2+5+1;
6487 else if (*p<1000000)
6488 repsize += 2+6+1;
6489 else
6490 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006491#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 }
6493 requiredsize = respos+repsize+(endp-collend);
6494 if (requiredsize > ressize) {
6495 if (requiredsize<2*ressize)
6496 requiredsize = 2*ressize;
6497 if (_PyBytes_Resize(&res, requiredsize))
6498 goto onError;
6499 str = PyBytes_AS_STRING(res) + respos;
6500 ressize = requiredsize;
6501 }
6502 /* generate replacement (temporarily (mis)uses p) */
6503 for (p = collstart; p < collend; ++p) {
6504 str += sprintf(str, "&#%d;", (int)*p);
6505 }
6506 p = collend;
6507 break;
6508 default:
6509 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6510 encoding, reason, startp, size, &exc,
6511 collstart-startp, collend-startp, &newpos);
6512 if (repunicode == NULL)
6513 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006514 if (PyBytes_Check(repunicode)) {
6515 /* Directly copy bytes result to output. */
6516 repsize = PyBytes_Size(repunicode);
6517 if (repsize > 1) {
6518 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006519 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006520 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6521 Py_DECREF(repunicode);
6522 goto onError;
6523 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006524 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006525 ressize += repsize-1;
6526 }
6527 memcpy(str, PyBytes_AsString(repunicode), repsize);
6528 str += repsize;
6529 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006530 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006531 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 /* need more space? (at least enough for what we
6534 have+the replacement+the rest of the string, so
6535 we won't have to check space for encodable characters) */
6536 respos = str - PyBytes_AS_STRING(res);
6537 repsize = PyUnicode_GET_SIZE(repunicode);
6538 requiredsize = respos+repsize+(endp-collend);
6539 if (requiredsize > ressize) {
6540 if (requiredsize<2*ressize)
6541 requiredsize = 2*ressize;
6542 if (_PyBytes_Resize(&res, requiredsize)) {
6543 Py_DECREF(repunicode);
6544 goto onError;
6545 }
6546 str = PyBytes_AS_STRING(res) + respos;
6547 ressize = requiredsize;
6548 }
6549 /* check if there is anything unencodable in the replacement
6550 and copy it to the output */
6551 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6552 c = *uni2;
6553 if (c >= limit) {
6554 raise_encode_exception(&exc, encoding, startp, size,
6555 unicodepos, unicodepos+1, reason);
6556 Py_DECREF(repunicode);
6557 goto onError;
6558 }
6559 *str = (char)c;
6560 }
6561 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006562 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006564 }
6565 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006566 /* Resize if we allocated too much */
6567 size = str - PyBytes_AS_STRING(res);
6568 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006569 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006570 if (_PyBytes_Resize(&res, size) < 0)
6571 goto onError;
6572 }
6573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006576 return res;
6577
6578 onError:
6579 Py_XDECREF(res);
6580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
6582 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583}
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585PyObject *
6586PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006587 Py_ssize_t size,
6588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
Alexander Belopolsky40018472011-02-26 01:02:56 +00006593PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006594_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
6596 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 PyErr_BadArgument();
6598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006600 if (PyUnicode_READY(unicode) == -1)
6601 return NULL;
6602 /* Fast path: if it is a one-byte string, construct
6603 bytes object directly. */
6604 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6605 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6606 PyUnicode_GET_LENGTH(unicode));
6607 /* Non-Latin-1 characters present. Defer to above function to
6608 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006611 errors);
6612}
6613
6614PyObject*
6615PyUnicode_AsLatin1String(PyObject *unicode)
6616{
6617 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618}
6619
6620/* --- 7-bit ASCII Codec -------------------------------------------------- */
6621
Alexander Belopolsky40018472011-02-26 01:02:56 +00006622PyObject *
6623PyUnicode_DecodeASCII(const char *s,
6624 Py_ssize_t size,
6625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006629 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006630 Py_ssize_t startinpos;
6631 Py_ssize_t endinpos;
6632 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006633 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006634 int has_error;
6635 const unsigned char *p = (const unsigned char *)s;
6636 const unsigned char *end = p + size;
6637 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006638 PyObject *errorHandler = NULL;
6639 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006640
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006642 if (size == 1 && (unsigned char)s[0] < 128)
6643 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644
Victor Stinner702c7342011-10-05 13:50:52 +02006645 has_error = 0;
6646 while (p < end && !has_error) {
6647 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6648 an explanation. */
6649 if (!((size_t) p & LONG_PTR_MASK)) {
6650 /* Help register allocation */
6651 register const unsigned char *_p = p;
6652 while (_p < aligned_end) {
6653 unsigned long value = *(unsigned long *) _p;
6654 if (value & ASCII_CHAR_MASK) {
6655 has_error = 1;
6656 break;
6657 }
6658 _p += SIZEOF_LONG;
6659 }
6660 if (_p == end)
6661 break;
6662 if (has_error)
6663 break;
6664 p = _p;
6665 }
6666 if (*p & 0x80) {
6667 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006669 }
6670 else {
6671 ++p;
6672 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006673 }
Victor Stinner702c7342011-10-05 13:50:52 +02006674 if (!has_error)
6675 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 v = _PyUnicode_New(size);
6678 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006682 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 e = s + size;
6684 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 register unsigned char c = (unsigned char)*s;
6686 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006687 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 ++s;
6689 }
6690 else {
6691 startinpos = s-starts;
6692 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006693 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 if (unicode_decode_call_errorhandler(
6695 errors, &errorHandler,
6696 "ascii", "ordinal not in range(128)",
6697 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006698 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 goto onError;
6700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 }
Victor Stinner702c7342011-10-05 13:50:52 +02006702 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6703 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 Py_XDECREF(errorHandler);
6706 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006707#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006708 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006709 Py_DECREF(v);
6710 return NULL;
6711 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006712#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006713 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006715
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 Py_XDECREF(errorHandler);
6719 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 return NULL;
6721}
6722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_EncodeASCII(const Py_UNICODE *p,
6725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729}
6730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006732_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733{
6734 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 PyErr_BadArgument();
6736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 if (PyUnicode_READY(unicode) == -1)
6739 return NULL;
6740 /* Fast path: if it is an ASCII-only string, construct bytes object
6741 directly. Else defer to above function to raise the exception. */
6742 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6743 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6744 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 errors);
6748}
6749
6750PyObject *
6751PyUnicode_AsASCIIString(PyObject *unicode)
6752{
6753 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
Victor Stinner99b95382011-07-04 14:23:54 +02006756#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006758/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006759
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006760#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761#define NEED_RETRY
6762#endif
6763
6764/* XXX This code is limited to "true" double-byte encodings, as
6765 a) it assumes an incomplete character consists of a single byte, and
6766 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, because the byte could also be the
   trail byte of a preceding character.  CharPrev() is used to find where
   the previous character starts, and the byte is accepted as a lead byte
   when any of the following holds:
     - CharPrev() returns the current position itself,
     - the previous character does not start with a lead byte,
     - the previous character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6780
6781/*
6782 * Decode MBCS string into unicode object. If 'final' is set, converts
6783 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6784 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006785static int
6786decode_mbcs(PyUnicodeObject **v,
6787 const char *s, /* MBCS string */
6788 int size, /* sizeof MBCS string */
6789 int final,
6790 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791{
6792 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006793 Py_ssize_t n;
6794 DWORD usize;
6795 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796
6797 assert(size >= 0);
6798
Victor Stinner554f3f02010-06-16 23:33:54 +00006799 /* check and handle 'errors' arg */
6800 if (errors==NULL || strcmp(errors, "strict")==0)
6801 flags = MB_ERR_INVALID_CHARS;
6802 else if (strcmp(errors, "ignore")==0)
6803 flags = 0;
6804 else {
6805 PyErr_Format(PyExc_ValueError,
6806 "mbcs encoding does not support errors='%s'",
6807 errors);
6808 return -1;
6809 }
6810
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811 /* Skip trailing lead-byte unless 'final' is set */
6812 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
6815 /* First get the size of the result */
6816 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006817 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6818 if (usize==0)
6819 goto mbcs_decode_error;
6820 } else
6821 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822
6823 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 /* Create unicode object */
6825 *v = _PyUnicode_New(usize);
6826 if (*v == NULL)
6827 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006828 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829 }
6830 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 /* Extend unicode object */
6832 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006833 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835 }
6836
6837 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006838 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006840 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6841 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006845
6846mbcs_decode_error:
6847 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6848 we raise a UnicodeDecodeError - else it is a 'generic'
6849 windows error
6850 */
6851 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6852 /* Ideally, we should get reason from FormatMessage - this
6853 is the Windows 2000 English version of the message
6854 */
6855 PyObject *exc = NULL;
6856 const char *reason = "No mapping for the Unicode character exists "
6857 "in the target multi-byte code page.";
6858 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6859 if (exc != NULL) {
6860 PyCodec_StrictErrors(exc);
6861 Py_DECREF(exc);
6862 }
6863 } else {
6864 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6865 }
6866 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867}
6868
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869PyObject *
6870PyUnicode_DecodeMBCSStateful(const char *s,
6871 Py_ssize_t size,
6872 const char *errors,
6873 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874{
6875 PyUnicodeObject *v = NULL;
6876 int done;
6877
6878 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880
6881#ifdef NEED_RETRY
6882 retry:
6883 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006884 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 else
6886#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006887 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888
6889 if (done < 0) {
6890 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 }
6893
6894 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896
6897#ifdef NEED_RETRY
6898 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 s += done;
6900 size -= done;
6901 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902 }
6903#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006904#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006905 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006906 Py_DECREF(v);
6907 return NULL;
6908 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006909#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006910 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911 return (PyObject *)v;
6912}
6913
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914PyObject *
6915PyUnicode_DecodeMBCS(const char *s,
6916 Py_ssize_t size,
6917 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006918{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6920}
6921
6922/*
6923 * Convert unicode into string object (MBCS).
6924 * Returns 0 if succeed, -1 otherwise.
6925 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926static int
6927encode_mbcs(PyObject **repr,
6928 const Py_UNICODE *p, /* unicode */
6929 int size, /* size of unicode */
6930 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931{
Victor Stinner554f3f02010-06-16 23:33:54 +00006932 BOOL usedDefaultChar = FALSE;
6933 BOOL *pusedDefaultChar;
6934 int mbcssize;
6935 Py_ssize_t n;
6936 PyObject *exc = NULL;
6937 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938
6939 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006940
Victor Stinner554f3f02010-06-16 23:33:54 +00006941 /* check and handle 'errors' arg */
6942 if (errors==NULL || strcmp(errors, "strict")==0) {
6943 flags = WC_NO_BEST_FIT_CHARS;
6944 pusedDefaultChar = &usedDefaultChar;
6945 } else if (strcmp(errors, "replace")==0) {
6946 flags = 0;
6947 pusedDefaultChar = NULL;
6948 } else {
6949 PyErr_Format(PyExc_ValueError,
6950 "mbcs encoding does not support errors='%s'",
6951 errors);
6952 return -1;
6953 }
6954
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006955 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006957 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6958 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 if (mbcssize == 0) {
6960 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6961 return -1;
6962 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006963 /* If we used a default char, then we failed! */
6964 if (pusedDefaultChar && *pusedDefaultChar)
6965 goto mbcs_encode_error;
6966 } else {
6967 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006968 }
6969
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 /* Create string object */
6972 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6973 if (*repr == NULL)
6974 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006975 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 }
6977 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 /* Extend string object */
6979 n = PyBytes_Size(*repr);
6980 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6981 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 }
6983
6984 /* Do the conversion */
6985 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006987 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6988 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6990 return -1;
6991 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006992 if (pusedDefaultChar && *pusedDefaultChar)
6993 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006996
6997mbcs_encode_error:
6998 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6999 Py_XDECREF(exc);
7000 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007001}
7002
Alexander Belopolsky40018472011-02-26 01:02:56 +00007003PyObject *
7004PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7005 Py_ssize_t size,
7006 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007007{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008 PyObject *repr = NULL;
7009 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007010
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007014 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015 else
7016#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007017 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007018
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007019 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 Py_XDECREF(repr);
7021 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007022 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
7024#ifdef NEED_RETRY
7025 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 p += INT_MAX;
7027 size -= INT_MAX;
7028 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029 }
7030#endif
7031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007032 return repr;
7033}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007034
Alexander Belopolsky40018472011-02-26 01:02:56 +00007035PyObject *
7036PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007037{
7038 if (!PyUnicode_Check(unicode)) {
7039 PyErr_BadArgument();
7040 return NULL;
7041 }
7042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 PyUnicode_GET_SIZE(unicode),
7044 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007045}
7046
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047#undef NEED_RETRY
7048
Victor Stinner99b95382011-07-04 14:23:54 +02007049#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007050
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051/* --- Character Mapping Codec -------------------------------------------- */
7052
Alexander Belopolsky40018472011-02-26 01:02:56 +00007053PyObject *
7054PyUnicode_DecodeCharmap(const char *s,
7055 Py_ssize_t size,
7056 PyObject *mapping,
7057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007060 Py_ssize_t startinpos;
7061 Py_ssize_t endinpos;
7062 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007063 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 PyUnicodeObject *v;
7065 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007066 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007067 PyObject *errorHandler = NULL;
7068 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007069 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007070 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007071
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 /* Default to Latin-1 */
7073 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
7076 v = _PyUnicode_New(size);
7077 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007083 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 mapstring = PyUnicode_AS_UNICODE(mapping);
7085 maplen = PyUnicode_GET_SIZE(mapping);
7086 while (s < e) {
7087 unsigned char ch = *s;
7088 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 if (ch < maplen)
7091 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 if (x == 0xfffe) {
7094 /* undefined mapping */
7095 outpos = p-PyUnicode_AS_UNICODE(v);
7096 startinpos = s-starts;
7097 endinpos = startinpos+1;
7098 if (unicode_decode_call_errorhandler(
7099 errors, &errorHandler,
7100 "charmap", "character maps to <undefined>",
7101 &starts, &e, &startinpos, &endinpos, &exc, &s,
7102 &v, &outpos, &p)) {
7103 goto onError;
7104 }
7105 continue;
7106 }
7107 *p++ = x;
7108 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007109 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007110 }
7111 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 while (s < e) {
7113 unsigned char ch = *s;
7114 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007115
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7117 w = PyLong_FromLong((long)ch);
7118 if (w == NULL)
7119 goto onError;
7120 x = PyObject_GetItem(mapping, w);
7121 Py_DECREF(w);
7122 if (x == NULL) {
7123 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7124 /* No mapping found means: mapping is undefined. */
7125 PyErr_Clear();
7126 x = Py_None;
7127 Py_INCREF(x);
7128 } else
7129 goto onError;
7130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007131
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 /* Apply mapping */
7133 if (PyLong_Check(x)) {
7134 long value = PyLong_AS_LONG(x);
7135 if (value < 0 || value > 65535) {
7136 PyErr_SetString(PyExc_TypeError,
7137 "character mapping must be in range(65536)");
7138 Py_DECREF(x);
7139 goto onError;
7140 }
7141 *p++ = (Py_UNICODE)value;
7142 }
7143 else if (x == Py_None) {
7144 /* undefined mapping */
7145 outpos = p-PyUnicode_AS_UNICODE(v);
7146 startinpos = s-starts;
7147 endinpos = startinpos+1;
7148 if (unicode_decode_call_errorhandler(
7149 errors, &errorHandler,
7150 "charmap", "character maps to <undefined>",
7151 &starts, &e, &startinpos, &endinpos, &exc, &s,
7152 &v, &outpos, &p)) {
7153 Py_DECREF(x);
7154 goto onError;
7155 }
7156 Py_DECREF(x);
7157 continue;
7158 }
7159 else if (PyUnicode_Check(x)) {
7160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007161
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 if (targetsize == 1)
7163 /* 1-1 mapping */
7164 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007165
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 else if (targetsize > 1) {
7167 /* 1-n mapping */
7168 if (targetsize > extrachars) {
7169 /* resize first */
7170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7171 Py_ssize_t needed = (targetsize - extrachars) + \
7172 (targetsize << 2);
7173 extrachars += needed;
7174 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007175 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 PyUnicode_GET_SIZE(v) + needed) < 0) {
7177 Py_DECREF(x);
7178 goto onError;
7179 }
7180 p = PyUnicode_AS_UNICODE(v) + oldpos;
7181 }
7182 Py_UNICODE_COPY(p,
7183 PyUnicode_AS_UNICODE(x),
7184 targetsize);
7185 p += targetsize;
7186 extrachars -= targetsize;
7187 }
7188 /* 1-0 mapping: skip the character */
7189 }
7190 else {
7191 /* wrong return value */
7192 PyErr_SetString(PyExc_TypeError,
7193 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007194 Py_DECREF(x);
7195 goto onError;
7196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 Py_DECREF(x);
7198 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 }
7201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007202 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 Py_XDECREF(errorHandler);
7205 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007206#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007207 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007208 Py_DECREF(v);
7209 return NULL;
7210 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007211#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007212 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007214
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 Py_XDECREF(errorHandler);
7217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 Py_XDECREF(v);
7219 return NULL;
7220}
7221
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007222/* Charmap encoding: the lookup table */
7223
Alexander Belopolsky40018472011-02-26 01:02:56 +00007224struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 PyObject_HEAD
7226 unsigned char level1[32];
7227 int count2, count3;
7228 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007229};
7230
7231static PyObject*
7232encoding_map_size(PyObject *obj, PyObject* args)
7233{
7234 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007235 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237}
7238
7239static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007240 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 PyDoc_STR("Return the size (in bytes) of this object") },
7242 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007243};
7244
7245static void
7246encoding_map_dealloc(PyObject* o)
7247{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007248 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007249}
7250
7251static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007252 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 "EncodingMap", /*tp_name*/
7254 sizeof(struct encoding_map), /*tp_basicsize*/
7255 0, /*tp_itemsize*/
7256 /* methods */
7257 encoding_map_dealloc, /*tp_dealloc*/
7258 0, /*tp_print*/
7259 0, /*tp_getattr*/
7260 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007261 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 0, /*tp_repr*/
7263 0, /*tp_as_number*/
7264 0, /*tp_as_sequence*/
7265 0, /*tp_as_mapping*/
7266 0, /*tp_hash*/
7267 0, /*tp_call*/
7268 0, /*tp_str*/
7269 0, /*tp_getattro*/
7270 0, /*tp_setattro*/
7271 0, /*tp_as_buffer*/
7272 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7273 0, /*tp_doc*/
7274 0, /*tp_traverse*/
7275 0, /*tp_clear*/
7276 0, /*tp_richcompare*/
7277 0, /*tp_weaklistoffset*/
7278 0, /*tp_iter*/
7279 0, /*tp_iternext*/
7280 encoding_map_methods, /*tp_methods*/
7281 0, /*tp_members*/
7282 0, /*tp_getset*/
7283 0, /*tp_base*/
7284 0, /*tp_dict*/
7285 0, /*tp_descr_get*/
7286 0, /*tp_descr_set*/
7287 0, /*tp_dictoffset*/
7288 0, /*tp_init*/
7289 0, /*tp_alloc*/
7290 0, /*tp_new*/
7291 0, /*tp_free*/
7292 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007293};
7294
7295PyObject*
7296PyUnicode_BuildEncodingMap(PyObject* string)
7297{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007298 PyObject *result;
7299 struct encoding_map *mresult;
7300 int i;
7301 int need_dict = 0;
7302 unsigned char level1[32];
7303 unsigned char level2[512];
7304 unsigned char *mlevel1, *mlevel2, *mlevel3;
7305 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007306 int kind;
7307 void *data;
7308 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007311 PyErr_BadArgument();
7312 return NULL;
7313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 kind = PyUnicode_KIND(string);
7315 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007316 memset(level1, 0xFF, sizeof level1);
7317 memset(level2, 0xFF, sizeof level2);
7318
7319 /* If there isn't a one-to-one mapping of NULL to \0,
7320 or if there are non-BMP characters, we need to use
7321 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007322 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007323 need_dict = 1;
7324 for (i = 1; i < 256; i++) {
7325 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007326 ch = PyUnicode_READ(kind, data, i);
7327 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007328 need_dict = 1;
7329 break;
7330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007331 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007332 /* unmapped character */
7333 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007334 l1 = ch >> 11;
7335 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007336 if (level1[l1] == 0xFF)
7337 level1[l1] = count2++;
7338 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007339 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007340 }
7341
7342 if (count2 >= 0xFF || count3 >= 0xFF)
7343 need_dict = 1;
7344
7345 if (need_dict) {
7346 PyObject *result = PyDict_New();
7347 PyObject *key, *value;
7348 if (!result)
7349 return NULL;
7350 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007352 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007353 if (!key || !value)
7354 goto failed1;
7355 if (PyDict_SetItem(result, key, value) == -1)
7356 goto failed1;
7357 Py_DECREF(key);
7358 Py_DECREF(value);
7359 }
7360 return result;
7361 failed1:
7362 Py_XDECREF(key);
7363 Py_XDECREF(value);
7364 Py_DECREF(result);
7365 return NULL;
7366 }
7367
7368 /* Create a three-level trie */
7369 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7370 16*count2 + 128*count3 - 1);
7371 if (!result)
7372 return PyErr_NoMemory();
7373 PyObject_Init(result, &EncodingMapType);
7374 mresult = (struct encoding_map*)result;
7375 mresult->count2 = count2;
7376 mresult->count3 = count3;
7377 mlevel1 = mresult->level1;
7378 mlevel2 = mresult->level23;
7379 mlevel3 = mresult->level23 + 16*count2;
7380 memcpy(mlevel1, level1, 32);
7381 memset(mlevel2, 0xFF, 16*count2);
7382 memset(mlevel3, 0, 128*count3);
7383 count3 = 0;
7384 for (i = 1; i < 256; i++) {
7385 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007386 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007387 /* unmapped character */
7388 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389 o1 = PyUnicode_READ(kind, data, i)>>11;
7390 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007391 i2 = 16*mlevel1[o1] + o2;
7392 if (mlevel2[i2] == 0xFF)
7393 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007394 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007395 i3 = 128*mlevel2[i2] + o3;
7396 mlevel3[i3] = i;
7397 }
7398 return result;
7399}
7400
7401static int
7402encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7403{
7404 struct encoding_map *map = (struct encoding_map*)mapping;
7405 int l1 = c>>11;
7406 int l2 = (c>>7) & 0xF;
7407 int l3 = c & 0x7F;
7408 int i;
7409
7410#ifdef Py_UNICODE_WIDE
7411 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007413 }
7414#endif
7415 if (c == 0)
7416 return 0;
7417 /* level 1*/
7418 i = map->level1[l1];
7419 if (i == 0xFF) {
7420 return -1;
7421 }
7422 /* level 2*/
7423 i = map->level23[16*i+l2];
7424 if (i == 0xFF) {
7425 return -1;
7426 }
7427 /* level 3 */
7428 i = map->level23[16*map->count2 + 128*i + l3];
7429 if (i == 0) {
7430 return -1;
7431 }
7432 return i;
7433}
7434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007435/* Lookup the character ch in the mapping. If the character
7436 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007437 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007438static PyObject *
7439charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440{
Christian Heimes217cfd12007-12-02 14:31:20 +00007441 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 PyObject *x;
7443
7444 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 x = PyObject_GetItem(mapping, w);
7447 Py_DECREF(w);
7448 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7450 /* No mapping found means: mapping is undefined. */
7451 PyErr_Clear();
7452 x = Py_None;
7453 Py_INCREF(x);
7454 return x;
7455 } else
7456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007458 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007460 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 long value = PyLong_AS_LONG(x);
7462 if (value < 0 || value > 255) {
7463 PyErr_SetString(PyExc_TypeError,
7464 "character mapping must be in range(256)");
7465 Py_DECREF(x);
7466 return NULL;
7467 }
7468 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007470 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 /* wrong return value */
7474 PyErr_Format(PyExc_TypeError,
7475 "character mapping must return integer, bytes or None, not %.400s",
7476 x->ob_type->tp_name);
7477 Py_DECREF(x);
7478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 }
7480}
7481
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007482static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007483charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007484{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007485 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7486 /* exponentially overallocate to minimize reallocations */
7487 if (requiredsize < 2*outsize)
7488 requiredsize = 2*outsize;
7489 if (_PyBytes_Resize(outobj, requiredsize))
7490 return -1;
7491 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007492}
7493
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007497/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007498 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 space is available. Return a new reference to the object that
7500 was put in the output buffer, or Py_None, if the mapping was undefined
7501 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007502 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503static charmapencode_result
7504charmapencode_output(Py_UNICODE c, PyObject *mapping,
7505 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007507 PyObject *rep;
7508 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007509 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510
Christian Heimes90aa7642007-12-19 02:45:37 +00007511 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007512 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007514 if (res == -1)
7515 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 if (outsize<requiredsize)
7517 if (charmapencode_resize(outobj, outpos, requiredsize))
7518 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007519 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 outstart[(*outpos)++] = (char)res;
7521 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007522 }
7523
7524 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007525 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007527 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 Py_DECREF(rep);
7529 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007530 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (PyLong_Check(rep)) {
7532 Py_ssize_t requiredsize = *outpos+1;
7533 if (outsize<requiredsize)
7534 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7535 Py_DECREF(rep);
7536 return enc_EXCEPTION;
7537 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007538 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 else {
7542 const char *repchars = PyBytes_AS_STRING(rep);
7543 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7544 Py_ssize_t requiredsize = *outpos+repsize;
7545 if (outsize<requiredsize)
7546 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7547 Py_DECREF(rep);
7548 return enc_EXCEPTION;
7549 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007550 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 memcpy(outstart + *outpos, repchars, repsize);
7552 *outpos += repsize;
7553 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007555 Py_DECREF(rep);
7556 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557}
7558
7559/* handle an error in PyUnicode_EncodeCharmap
7560 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561static int
7562charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007565 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007566 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567{
7568 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007569 Py_ssize_t repsize;
7570 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571 Py_UNICODE *uni2;
7572 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007573 Py_ssize_t collstartpos = *inpos;
7574 Py_ssize_t collendpos = *inpos+1;
7575 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576 char *encoding = "charmap";
7577 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 /* find all unencodable characters */
7581 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007582 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007583 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 int res = encoding_map_lookup(p[collendpos], mapping);
7585 if (res != -1)
7586 break;
7587 ++collendpos;
7588 continue;
7589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007590
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 rep = charmapencode_lookup(p[collendpos], mapping);
7592 if (rep==NULL)
7593 return -1;
7594 else if (rep!=Py_None) {
7595 Py_DECREF(rep);
7596 break;
7597 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600 }
7601 /* cache callback name lookup
7602 * (if not done yet, i.e. it's the first error) */
7603 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 if ((errors==NULL) || (!strcmp(errors, "strict")))
7605 *known_errorHandler = 1;
7606 else if (!strcmp(errors, "replace"))
7607 *known_errorHandler = 2;
7608 else if (!strcmp(errors, "ignore"))
7609 *known_errorHandler = 3;
7610 else if (!strcmp(errors, "xmlcharrefreplace"))
7611 *known_errorHandler = 4;
7612 else
7613 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614 }
7615 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 case 1: /* strict */
7617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7618 return -1;
7619 case 2: /* replace */
7620 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 x = charmapencode_output('?', mapping, res, respos);
7622 if (x==enc_EXCEPTION) {
7623 return -1;
7624 }
7625 else if (x==enc_FAILED) {
7626 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7627 return -1;
7628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 }
7630 /* fall through */
7631 case 3: /* ignore */
7632 *inpos = collendpos;
7633 break;
7634 case 4: /* xmlcharrefreplace */
7635 /* generate replacement (temporarily (mis)uses p) */
7636 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 char buffer[2+29+1+1];
7638 char *cp;
7639 sprintf(buffer, "&#%d;", (int)p[collpos]);
7640 for (cp = buffer; *cp; ++cp) {
7641 x = charmapencode_output(*cp, mapping, res, respos);
7642 if (x==enc_EXCEPTION)
7643 return -1;
7644 else if (x==enc_FAILED) {
7645 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7646 return -1;
7647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648 }
7649 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 *inpos = collendpos;
7651 break;
7652 default:
7653 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 encoding, reason, p, size, exceptionObject,
7655 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007658 if (PyBytes_Check(repunicode)) {
7659 /* Directly copy bytes result to output. */
7660 Py_ssize_t outsize = PyBytes_Size(*res);
7661 Py_ssize_t requiredsize;
7662 repsize = PyBytes_Size(repunicode);
7663 requiredsize = *respos + repsize;
7664 if (requiredsize > outsize)
7665 /* Make room for all additional bytes. */
7666 if (charmapencode_resize(res, respos, requiredsize)) {
7667 Py_DECREF(repunicode);
7668 return -1;
7669 }
7670 memcpy(PyBytes_AsString(*res) + *respos,
7671 PyBytes_AsString(repunicode), repsize);
7672 *respos += repsize;
7673 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007674 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007675 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007677 /* generate replacement */
7678 repsize = PyUnicode_GET_SIZE(repunicode);
7679 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 x = charmapencode_output(*uni2, mapping, res, respos);
7681 if (x==enc_EXCEPTION) {
7682 return -1;
7683 }
7684 else if (x==enc_FAILED) {
7685 Py_DECREF(repunicode);
7686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7687 return -1;
7688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007689 }
7690 *inpos = newpos;
7691 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007692 }
7693 return 0;
7694}
7695
Alexander Belopolsky40018472011-02-26 01:02:56 +00007696PyObject *
7697PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7698 Py_ssize_t size,
7699 PyObject *mapping,
7700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702 /* output object */
7703 PyObject *res = NULL;
7704 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007705 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007707 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 PyObject *errorHandler = NULL;
7709 PyObject *exc = NULL;
7710 /* the following variable is used for caching string comparisons
7711 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7712 * 3=ignore, 4=xmlcharrefreplace */
7713 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
7715 /* Default to Latin-1 */
7716 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719 /* allocate enough for a simple encoding without
7720 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007721 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 if (res == NULL)
7723 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007724 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 /* try to encode it */
7729 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7730 if (x==enc_EXCEPTION) /* error */
7731 goto onError;
7732 if (x==enc_FAILED) { /* unencodable character */
7733 if (charmap_encoding_error(p, size, &inpos, mapping,
7734 &exc,
7735 &known_errorHandler, &errorHandler, errors,
7736 &res, &respos)) {
7737 goto onError;
7738 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 else
7741 /* done with this character => adjust input position */
7742 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007746 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007747 if (_PyBytes_Resize(&res, respos) < 0)
7748 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750 Py_XDECREF(exc);
7751 Py_XDECREF(errorHandler);
7752 return res;
7753
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755 Py_XDECREF(res);
7756 Py_XDECREF(exc);
7757 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 return NULL;
7759}
7760
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761PyObject *
7762PyUnicode_AsCharmapString(PyObject *unicode,
7763 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764{
7765 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 PyErr_BadArgument();
7767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
7769 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 PyUnicode_GET_SIZE(unicode),
7771 mapping,
7772 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773}
7774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007775/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776static void
7777make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007778 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007779 Py_ssize_t startpos, Py_ssize_t endpos,
7780 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007783 *exceptionObject = _PyUnicodeTranslateError_Create(
7784 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 }
7786 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7788 goto onError;
7789 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7790 goto onError;
7791 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7792 goto onError;
7793 return;
7794 onError:
7795 Py_DECREF(*exceptionObject);
7796 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 }
7798}
7799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007801static void
7802raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007804 Py_ssize_t startpos, Py_ssize_t endpos,
7805 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806{
7807 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811}
7812
7813/* error handling callback helper:
7814 build arguments, call the callback and check the arguments,
7815 put the result into newpos and return the replacement string, which
7816 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
7818unicode_translate_call_errorhandler(const char *errors,
7819 PyObject **errorHandler,
7820 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007822 Py_ssize_t startpos, Py_ssize_t endpos,
7823 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007824{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007825 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007827 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828 PyObject *restuple;
7829 PyObject *resunicode;
7830
7831 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 }
7836
7837 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841
7842 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007844 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007846 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007847 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 Py_DECREF(restuple);
7849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007850 }
7851 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 &resunicode, &i_newpos)) {
7853 Py_DECREF(restuple);
7854 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007855 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007856 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007858 else
7859 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7862 Py_DECREF(restuple);
7863 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007864 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 Py_INCREF(resunicode);
7866 Py_DECREF(restuple);
7867 return resunicode;
7868}
7869
7870/* Lookup the character ch in the mapping and put the result in result,
7871 which must be decrefed by the caller.
7872 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007873static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007874charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875{
Christian Heimes217cfd12007-12-02 14:31:20 +00007876 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 PyObject *x;
7878
7879 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881 x = PyObject_GetItem(mapping, w);
7882 Py_DECREF(w);
7883 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7885 /* No mapping found means: use 1:1 mapping. */
7886 PyErr_Clear();
7887 *result = NULL;
7888 return 0;
7889 } else
7890 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891 }
7892 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 *result = x;
7894 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007895 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007896 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 long value = PyLong_AS_LONG(x);
7898 long max = PyUnicode_GetMax();
7899 if (value < 0 || value > max) {
7900 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007901 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 Py_DECREF(x);
7903 return -1;
7904 }
7905 *result = x;
7906 return 0;
7907 }
7908 else if (PyUnicode_Check(x)) {
7909 *result = x;
7910 return 0;
7911 }
7912 else {
7913 /* wrong return value */
7914 PyErr_SetString(PyExc_TypeError,
7915 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007916 Py_DECREF(x);
7917 return -1;
7918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007919}
7920/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 if not reallocate and adjust various state variables.
7922 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007923static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007928 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 /* exponentially overallocate to minimize reallocations */
7930 if (requiredsize < 2 * oldsize)
7931 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7933 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936 }
7937 return 0;
7938}
7939/* lookup the character, put the result in the output string and adjust
7940 various state variables. Return a new reference to the object that
7941 was put in the output buffer in *result, or Py_None, if the mapping was
7942 undefined (in which case no character was written).
7943 The called must decref result.
7944 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007945static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7947 PyObject *mapping, Py_UCS4 **output,
7948 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007949 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7952 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 }
7958 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007960 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 }
7964 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007965 Py_ssize_t repsize;
7966 if (PyUnicode_READY(*res) == -1)
7967 return -1;
7968 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 if (repsize==1) {
7970 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 }
7973 else if (repsize!=0) {
7974 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 Py_ssize_t requiredsize = *opos +
7976 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978 Py_ssize_t i;
7979 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 for(i = 0; i < repsize; i++)
7982 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 }
7985 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987 return 0;
7988}
7989
Alexander Belopolsky40018472011-02-26 01:02:56 +00007990PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007991_PyUnicode_TranslateCharmap(PyObject *input,
7992 PyObject *mapping,
7993 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007995 /* input object */
7996 char *idata;
7997 Py_ssize_t size, i;
7998 int kind;
7999 /* output buffer */
8000 Py_UCS4 *output = NULL;
8001 Py_ssize_t osize;
8002 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 char *reason = "character maps to <undefined>";
8006 PyObject *errorHandler = NULL;
8007 PyObject *exc = NULL;
8008 /* the following variable is used for caching string comparisons
8009 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8010 * 3=ignore, 4=xmlcharrefreplace */
8011 int known_errorHandler = -1;
8012
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 PyErr_BadArgument();
8015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008018 if (PyUnicode_READY(input) == -1)
8019 return NULL;
8020 idata = (char*)PyUnicode_DATA(input);
8021 kind = PyUnicode_KIND(input);
8022 size = PyUnicode_GET_LENGTH(input);
8023 i = 0;
8024
8025 if (size == 0) {
8026 Py_INCREF(input);
8027 return input;
8028 }
8029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 /* allocate enough for a simple 1:1 translation without
8031 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 osize = size;
8033 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8034 opos = 0;
8035 if (output == NULL) {
8036 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 /* try to encode it */
8042 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 if (charmaptranslate_output(input, i, mapping,
8044 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 Py_XDECREF(x);
8046 goto onError;
8047 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 else { /* untranslatable character */
8052 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8053 Py_ssize_t repsize;
8054 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008057 Py_ssize_t collstart = i;
8058 Py_ssize_t collend = i+1;
8059 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 while (collend < size) {
8063 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 goto onError;
8065 Py_XDECREF(x);
8066 if (x!=Py_None)
8067 break;
8068 ++collend;
8069 }
8070 /* cache callback name lookup
8071 * (if not done yet, i.e. it's the first error) */
8072 if (known_errorHandler==-1) {
8073 if ((errors==NULL) || (!strcmp(errors, "strict")))
8074 known_errorHandler = 1;
8075 else if (!strcmp(errors, "replace"))
8076 known_errorHandler = 2;
8077 else if (!strcmp(errors, "ignore"))
8078 known_errorHandler = 3;
8079 else if (!strcmp(errors, "xmlcharrefreplace"))
8080 known_errorHandler = 4;
8081 else
8082 known_errorHandler = 0;
8083 }
8084 switch (known_errorHandler) {
8085 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 raise_translate_exception(&exc, input, collstart,
8087 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008088 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 case 2: /* replace */
8090 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 for (coll = collstart; coll<collend; coll++)
8092 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 /* fall through */
8094 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 break;
8097 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 /* generate replacement (temporarily (mis)uses i) */
8099 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 char buffer[2+29+1+1];
8101 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008102 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8103 if (charmaptranslate_makespace(&output, &osize,
8104 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 goto onError;
8106 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 break;
8111 default:
8112 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 reason, input, &exc,
8114 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008115 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 goto onError;
8117 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 repsize = PyUnicode_GET_LENGTH(repunicode);
8119 if (charmaptranslate_makespace(&output, &osize,
8120 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 Py_DECREF(repunicode);
8122 goto onError;
8123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 for (uni2 = 0; repsize-->0; ++uni2)
8125 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8126 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 }
8130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8132 if (!res)
8133 goto onError;
8134 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 Py_XDECREF(exc);
8136 Py_XDECREF(errorHandler);
8137 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 Py_XDECREF(exc);
8142 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 return NULL;
8144}
8145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146/* Deprecated. Use PyUnicode_Translate instead. */
8147PyObject *
8148PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8149 Py_ssize_t size,
8150 PyObject *mapping,
8151 const char *errors)
8152{
8153 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8154 if (!unicode)
8155 return NULL;
8156 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8157}
8158
Alexander Belopolsky40018472011-02-26 01:02:56 +00008159PyObject *
8160PyUnicode_Translate(PyObject *str,
8161 PyObject *mapping,
8162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163{
8164 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 str = PyUnicode_FromObject(str);
8167 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 Py_DECREF(str);
8171 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008172
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 Py_XDECREF(str);
8175 return NULL;
8176}
Tim Petersced69f82003-09-16 20:30:58 +00008177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008179fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180{
8181 /* No need to call PyUnicode_READY(self) because this function is only
8182 called as a callback from fixup() which does it already. */
8183 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8184 const int kind = PyUnicode_KIND(self);
8185 void *data = PyUnicode_DATA(self);
8186 Py_UCS4 maxchar = 0, ch, fixed;
8187 Py_ssize_t i;
8188
8189 for (i = 0; i < len; ++i) {
8190 ch = PyUnicode_READ(kind, data, i);
8191 fixed = 0;
8192 if (ch > 127) {
8193 if (Py_UNICODE_ISSPACE(ch))
8194 fixed = ' ';
8195 else {
8196 const int decimal = Py_UNICODE_TODECIMAL(ch);
8197 if (decimal >= 0)
8198 fixed = '0' + decimal;
8199 }
8200 if (fixed != 0) {
8201 if (fixed > maxchar)
8202 maxchar = fixed;
8203 PyUnicode_WRITE(kind, data, i, fixed);
8204 }
8205 else if (ch > maxchar)
8206 maxchar = ch;
8207 }
8208 else if (ch > maxchar)
8209 maxchar = ch;
8210 }
8211
8212 return maxchar;
8213}
8214
8215PyObject *
8216_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8217{
8218 if (!PyUnicode_Check(unicode)) {
8219 PyErr_BadInternalCall();
8220 return NULL;
8221 }
8222 if (PyUnicode_READY(unicode) == -1)
8223 return NULL;
8224 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8225 /* If the string is already ASCII, just return the same string */
8226 Py_INCREF(unicode);
8227 return unicode;
8228 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008229 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230}
8231
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008232PyObject *
8233PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8234 Py_ssize_t length)
8235{
8236 PyObject *result;
8237 Py_UNICODE *p; /* write pointer into result */
8238 Py_ssize_t i;
8239 /* Copy to a new string */
8240 result = (PyObject *)_PyUnicode_New(length);
8241 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8242 if (result == NULL)
8243 return result;
8244 p = PyUnicode_AS_UNICODE(result);
8245 /* Iterate over code points */
8246 for (i = 0; i < length; i++) {
8247 Py_UNICODE ch =s[i];
8248 if (ch > 127) {
8249 int decimal = Py_UNICODE_TODECIMAL(ch);
8250 if (decimal >= 0)
8251 p[i] = '0' + decimal;
8252 }
8253 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008254#ifndef DONT_MAKE_RESULT_READY
8255 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 Py_DECREF(result);
8257 return NULL;
8258 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008259#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008260 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008261 return result;
8262}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008263/* --- Decimal Encoder ---------------------------------------------------- */
8264
Alexander Belopolsky40018472011-02-26 01:02:56 +00008265int
8266PyUnicode_EncodeDecimal(Py_UNICODE *s,
8267 Py_ssize_t length,
8268 char *output,
8269 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008270{
8271 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 PyObject *errorHandler = NULL;
8273 PyObject *exc = NULL;
8274 const char *encoding = "decimal";
8275 const char *reason = "invalid decimal Unicode string";
8276 /* the following variable is used for caching string comparisons
8277 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8278 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008279
8280 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 PyErr_BadArgument();
8282 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008283 }
8284
8285 p = s;
8286 end = s + length;
8287 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 register Py_UNICODE ch = *p;
8289 int decimal;
8290 PyObject *repunicode;
8291 Py_ssize_t repsize;
8292 Py_ssize_t newpos;
8293 Py_UNICODE *uni2;
8294 Py_UNICODE *collstart;
8295 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008296
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008298 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 ++p;
8300 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 decimal = Py_UNICODE_TODECIMAL(ch);
8303 if (decimal >= 0) {
8304 *output++ = '0' + decimal;
8305 ++p;
8306 continue;
8307 }
8308 if (0 < ch && ch < 256) {
8309 *output++ = (char)ch;
8310 ++p;
8311 continue;
8312 }
8313 /* All other characters are considered unencodable */
8314 collstart = p;
8315 collend = p+1;
8316 while (collend < end) {
8317 if ((0 < *collend && *collend < 256) ||
8318 !Py_UNICODE_ISSPACE(*collend) ||
8319 Py_UNICODE_TODECIMAL(*collend))
8320 break;
8321 }
8322 /* cache callback name lookup
8323 * (if not done yet, i.e. it's the first error) */
8324 if (known_errorHandler==-1) {
8325 if ((errors==NULL) || (!strcmp(errors, "strict")))
8326 known_errorHandler = 1;
8327 else if (!strcmp(errors, "replace"))
8328 known_errorHandler = 2;
8329 else if (!strcmp(errors, "ignore"))
8330 known_errorHandler = 3;
8331 else if (!strcmp(errors, "xmlcharrefreplace"))
8332 known_errorHandler = 4;
8333 else
8334 known_errorHandler = 0;
8335 }
8336 switch (known_errorHandler) {
8337 case 1: /* strict */
8338 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8339 goto onError;
8340 case 2: /* replace */
8341 for (p = collstart; p < collend; ++p)
8342 *output++ = '?';
8343 /* fall through */
8344 case 3: /* ignore */
8345 p = collend;
8346 break;
8347 case 4: /* xmlcharrefreplace */
8348 /* generate replacement (temporarily (mis)uses p) */
8349 for (p = collstart; p < collend; ++p)
8350 output += sprintf(output, "&#%d;", (int)*p);
8351 p = collend;
8352 break;
8353 default:
8354 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8355 encoding, reason, s, length, &exc,
8356 collstart-s, collend-s, &newpos);
8357 if (repunicode == NULL)
8358 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008359 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008360 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008361 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8362 Py_DECREF(repunicode);
8363 goto onError;
8364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 /* generate replacement */
8366 repsize = PyUnicode_GET_SIZE(repunicode);
8367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8368 Py_UNICODE ch = *uni2;
8369 if (Py_UNICODE_ISSPACE(ch))
8370 *output++ = ' ';
8371 else {
8372 decimal = Py_UNICODE_TODECIMAL(ch);
8373 if (decimal >= 0)
8374 *output++ = '0' + decimal;
8375 else if (0 < ch && ch < 256)
8376 *output++ = (char)ch;
8377 else {
8378 Py_DECREF(repunicode);
8379 raise_encode_exception(&exc, encoding,
8380 s, length, collstart-s, collend-s, reason);
8381 goto onError;
8382 }
8383 }
8384 }
8385 p = s + newpos;
8386 Py_DECREF(repunicode);
8387 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008388 }
8389 /* 0-terminate the output string */
8390 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 Py_XDECREF(exc);
8392 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008393 return 0;
8394
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 Py_XDECREF(exc);
8397 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008398 return -1;
8399}
8400
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401/* --- Helpers ------------------------------------------------------------ */
8402
Victor Stinnerc3cec782011-10-05 21:24:08 +02008403#include "stringlib/asciilib.h"
8404#include "stringlib/fastsearch.h"
8405#include "stringlib/partition.h"
8406#include "stringlib/split.h"
8407#include "stringlib/count.h"
8408#include "stringlib/find.h"
8409#include "stringlib/localeutil.h"
8410#include "stringlib/undef.h"
8411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412#include "stringlib/ucs1lib.h"
8413#include "stringlib/fastsearch.h"
8414#include "stringlib/partition.h"
8415#include "stringlib/split.h"
8416#include "stringlib/count.h"
8417#include "stringlib/find.h"
8418#include "stringlib/localeutil.h"
8419#include "stringlib/undef.h"
8420
8421#include "stringlib/ucs2lib.h"
8422#include "stringlib/fastsearch.h"
8423#include "stringlib/partition.h"
8424#include "stringlib/split.h"
8425#include "stringlib/count.h"
8426#include "stringlib/find.h"
8427#include "stringlib/localeutil.h"
8428#include "stringlib/undef.h"
8429
8430#include "stringlib/ucs4lib.h"
8431#include "stringlib/fastsearch.h"
8432#include "stringlib/partition.h"
8433#include "stringlib/split.h"
8434#include "stringlib/count.h"
8435#include "stringlib/find.h"
8436#include "stringlib/localeutil.h"
8437#include "stringlib/undef.h"
8438
8439static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008440any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8441 const Py_UCS1*, Py_ssize_t,
8442 Py_ssize_t, Py_ssize_t),
8443 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 const Py_UCS1*, Py_ssize_t,
8445 Py_ssize_t, Py_ssize_t),
8446 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8447 const Py_UCS2*, Py_ssize_t,
8448 Py_ssize_t, Py_ssize_t),
8449 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8450 const Py_UCS4*, Py_ssize_t,
8451 Py_ssize_t, Py_ssize_t),
8452 PyObject* s1, PyObject* s2,
8453 Py_ssize_t start,
8454 Py_ssize_t end)
8455{
8456 int kind1, kind2, kind;
8457 void *buf1, *buf2;
8458 Py_ssize_t len1, len2, result;
8459
8460 kind1 = PyUnicode_KIND(s1);
8461 kind2 = PyUnicode_KIND(s2);
8462 kind = kind1 > kind2 ? kind1 : kind2;
8463 buf1 = PyUnicode_DATA(s1);
8464 buf2 = PyUnicode_DATA(s2);
8465 if (kind1 != kind)
8466 buf1 = _PyUnicode_AsKind(s1, kind);
8467 if (!buf1)
8468 return -2;
8469 if (kind2 != kind)
8470 buf2 = _PyUnicode_AsKind(s2, kind);
8471 if (!buf2) {
8472 if (kind1 != kind) PyMem_Free(buf1);
8473 return -2;
8474 }
8475 len1 = PyUnicode_GET_LENGTH(s1);
8476 len2 = PyUnicode_GET_LENGTH(s2);
8477
8478 switch(kind) {
8479 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008480 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8481 result = ascii(buf1, len1, buf2, len2, start, end);
8482 else
8483 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 break;
8485 case PyUnicode_2BYTE_KIND:
8486 result = ucs2(buf1, len1, buf2, len2, start, end);
8487 break;
8488 case PyUnicode_4BYTE_KIND:
8489 result = ucs4(buf1, len1, buf2, len2, start, end);
8490 break;
8491 default:
8492 assert(0); result = -2;
8493 }
8494
8495 if (kind1 != kind)
8496 PyMem_Free(buf1);
8497 if (kind2 != kind)
8498 PyMem_Free(buf2);
8499
8500 return result;
8501}
8502
8503Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008504_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_ssize_t n_buffer,
8506 void *digits, Py_ssize_t n_digits,
8507 Py_ssize_t min_width,
8508 const char *grouping,
8509 const char *thousands_sep)
8510{
8511 switch(kind) {
8512 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008513 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8514 return _PyUnicode_ascii_InsertThousandsGrouping(
8515 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8516 min_width, grouping, thousands_sep);
8517 else
8518 return _PyUnicode_ucs1_InsertThousandsGrouping(
8519 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8520 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 case PyUnicode_2BYTE_KIND:
8522 return _PyUnicode_ucs2_InsertThousandsGrouping(
8523 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8524 min_width, grouping, thousands_sep);
8525 case PyUnicode_4BYTE_KIND:
8526 return _PyUnicode_ucs4_InsertThousandsGrouping(
8527 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8528 min_width, grouping, thousands_sep);
8529 }
8530 assert(0);
8531 return -1;
8532}
8533
8534
Eric Smith8c663262007-08-25 02:26:07 +00008535#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008536#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008537
Thomas Wouters477c8d52006-05-27 19:21:47 +00008538#include "stringlib/count.h"
8539#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008540
/* Helper macro to fix up start/end slice values in place.
   `end` is clamped to `len`; a negative `start` or `end` is taken
   relative to `len` and clamped to 0 if still negative.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement,
   e.g. inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008555
Alexander Belopolsky40018472011-02-26 01:02:56 +00008556Py_ssize_t
8557PyUnicode_Count(PyObject *str,
8558 PyObject *substr,
8559 Py_ssize_t start,
8560 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008562 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008563 PyUnicodeObject* str_obj;
8564 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 int kind1, kind2, kind;
8566 void *buf1 = NULL, *buf2 = NULL;
8567 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008568
Thomas Wouters477c8d52006-05-27 19:21:47 +00008569 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008572 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008573 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 Py_DECREF(str_obj);
8575 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 }
Tim Petersced69f82003-09-16 20:30:58 +00008577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 kind1 = PyUnicode_KIND(str_obj);
8579 kind2 = PyUnicode_KIND(sub_obj);
8580 kind = kind1 > kind2 ? kind1 : kind2;
8581 buf1 = PyUnicode_DATA(str_obj);
8582 if (kind1 != kind)
8583 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8584 if (!buf1)
8585 goto onError;
8586 buf2 = PyUnicode_DATA(sub_obj);
8587 if (kind2 != kind)
8588 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8589 if (!buf2)
8590 goto onError;
8591 len1 = PyUnicode_GET_LENGTH(str_obj);
8592 len2 = PyUnicode_GET_LENGTH(sub_obj);
8593
8594 ADJUST_INDICES(start, end, len1);
8595 switch(kind) {
8596 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008597 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8598 result = asciilib_count(
8599 ((Py_UCS1*)buf1) + start, end - start,
8600 buf2, len2, PY_SSIZE_T_MAX
8601 );
8602 else
8603 result = ucs1lib_count(
8604 ((Py_UCS1*)buf1) + start, end - start,
8605 buf2, len2, PY_SSIZE_T_MAX
8606 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 break;
8608 case PyUnicode_2BYTE_KIND:
8609 result = ucs2lib_count(
8610 ((Py_UCS2*)buf1) + start, end - start,
8611 buf2, len2, PY_SSIZE_T_MAX
8612 );
8613 break;
8614 case PyUnicode_4BYTE_KIND:
8615 result = ucs4lib_count(
8616 ((Py_UCS4*)buf1) + start, end - start,
8617 buf2, len2, PY_SSIZE_T_MAX
8618 );
8619 break;
8620 default:
8621 assert(0); result = 0;
8622 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008623
8624 Py_DECREF(sub_obj);
8625 Py_DECREF(str_obj);
8626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 if (kind1 != kind)
8628 PyMem_Free(buf1);
8629 if (kind2 != kind)
8630 PyMem_Free(buf2);
8631
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 onError:
8634 Py_DECREF(sub_obj);
8635 Py_DECREF(str_obj);
8636 if (kind1 != kind && buf1)
8637 PyMem_Free(buf1);
8638 if (kind2 != kind && buf2)
8639 PyMem_Free(buf2);
8640 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641}
8642
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643Py_ssize_t
8644PyUnicode_Find(PyObject *str,
8645 PyObject *sub,
8646 Py_ssize_t start,
8647 Py_ssize_t end,
8648 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008650 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008651
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008655 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 Py_DECREF(str);
8658 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
Tim Petersced69f82003-09-16 20:30:58 +00008660
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008663 asciilib_find_slice, ucs1lib_find_slice,
8664 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008666 );
8667 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008669 asciilib_find_slice, ucs1lib_rfind_slice,
8670 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008672 );
8673
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008675 Py_DECREF(sub);
8676
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 return result;
8678}
8679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680Py_ssize_t
8681PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8682 Py_ssize_t start, Py_ssize_t end,
8683 int direction)
8684{
8685 char *result;
8686 int kind;
8687 if (PyUnicode_READY(str) == -1)
8688 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008689 if (start < 0 || end < 0) {
8690 PyErr_SetString(PyExc_IndexError, "string index out of range");
8691 return -2;
8692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 if (end > PyUnicode_GET_LENGTH(str))
8694 end = PyUnicode_GET_LENGTH(str);
8695 kind = PyUnicode_KIND(str);
8696 result = findchar(PyUnicode_1BYTE_DATA(str)
8697 + PyUnicode_KIND_SIZE(kind, start),
8698 kind,
8699 end-start, ch, direction);
8700 if (!result)
8701 return -1;
8702 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8703}
8704
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705static int
8706tailmatch(PyUnicodeObject *self,
8707 PyUnicodeObject *substring,
8708 Py_ssize_t start,
8709 Py_ssize_t end,
8710 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 int kind_self;
8713 int kind_sub;
8714 void *data_self;
8715 void *data_sub;
8716 Py_ssize_t offset;
8717 Py_ssize_t i;
8718 Py_ssize_t end_sub;
8719
8720 if (PyUnicode_READY(self) == -1 ||
8721 PyUnicode_READY(substring) == -1)
8722 return 0;
8723
8724 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 return 1;
8726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8728 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 kind_self = PyUnicode_KIND(self);
8733 data_self = PyUnicode_DATA(self);
8734 kind_sub = PyUnicode_KIND(substring);
8735 data_sub = PyUnicode_DATA(substring);
8736 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8737
8738 if (direction > 0)
8739 offset = end;
8740 else
8741 offset = start;
8742
8743 if (PyUnicode_READ(kind_self, data_self, offset) ==
8744 PyUnicode_READ(kind_sub, data_sub, 0) &&
8745 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8746 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8747 /* If both are of the same kind, memcmp is sufficient */
8748 if (kind_self == kind_sub) {
8749 return ! memcmp((char *)data_self +
8750 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8751 data_sub,
8752 PyUnicode_GET_LENGTH(substring) *
8753 PyUnicode_CHARACTER_SIZE(substring));
8754 }
8755 /* otherwise we have to compare each character by first accesing it */
8756 else {
8757 /* We do not need to compare 0 and len(substring)-1 because
8758 the if statement above ensured already that they are equal
8759 when we end up here. */
8760 // TODO: honor direction and do a forward or backwards search
8761 for (i = 1; i < end_sub; ++i) {
8762 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8763 PyUnicode_READ(kind_sub, data_sub, i))
8764 return 0;
8765 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 }
8769
8770 return 0;
8771}
8772
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773Py_ssize_t
8774PyUnicode_Tailmatch(PyObject *str,
8775 PyObject *substr,
8776 Py_ssize_t start,
8777 Py_ssize_t end,
8778 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008780 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008781
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 str = PyUnicode_FromObject(str);
8783 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 substr = PyUnicode_FromObject(substr);
8786 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 Py_DECREF(str);
8788 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
Tim Petersced69f82003-09-16 20:30:58 +00008790
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 (PyUnicodeObject *)substr,
8793 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 Py_DECREF(str);
8795 Py_DECREF(substr);
8796 return result;
8797}
8798
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799/* Apply fixfct filter to the Unicode object self and return a
8800 reference to the modified object */
8801
Alexander Belopolsky40018472011-02-26 01:02:56 +00008802static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008803fixup(PyObject *self,
8804 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 PyObject *u;
8807 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 if (PyUnicode_READY(self) == -1)
8810 return NULL;
8811 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8812 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8813 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8818 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 /* fix functions return the new maximum character in a string,
8821 if the kind of the resulting unicode object does not change,
8822 everything is fine. Otherwise we need to change the string kind
8823 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008824 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 if (maxchar_new == 0)
8826 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8827 else if (maxchar_new <= 127)
8828 maxchar_new = 127;
8829 else if (maxchar_new <= 255)
8830 maxchar_new = 255;
8831 else if (maxchar_new <= 65535)
8832 maxchar_new = 65535;
8833 else
8834 maxchar_new = 1114111; /* 0x10ffff */
8835
8836 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 /* fixfct should return TRUE if it modified the buffer. If
8838 FALSE, return a reference to the original buffer instead
8839 (to save space, not time) */
8840 Py_INCREF(self);
8841 Py_DECREF(u);
8842 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 else if (maxchar_new == maxchar_old) {
8845 return u;
8846 }
8847 else {
8848 /* In case the maximum character changed, we need to
8849 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008850 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (v == NULL) {
8852 Py_DECREF(u);
8853 return NULL;
8854 }
8855 if (maxchar_new > maxchar_old) {
8856 /* If the maxchar increased so that the kind changed, not all
8857 characters are representable anymore and we need to fix the
8858 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008859 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008860 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8862 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008863 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008864 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866
8867 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008868 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 return v;
8870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871}
8872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008874fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 /* No need to call PyUnicode_READY(self) because this function is only
8877 called as a callback from fixup() which does it already. */
8878 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8879 const int kind = PyUnicode_KIND(self);
8880 void *data = PyUnicode_DATA(self);
8881 int touched = 0;
8882 Py_UCS4 maxchar = 0;
8883 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 for (i = 0; i < len; ++i) {
8886 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8887 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8888 if (up != ch) {
8889 if (up > maxchar)
8890 maxchar = up;
8891 PyUnicode_WRITE(kind, data, i, up);
8892 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 else if (ch > maxchar)
8895 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 }
8897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (touched)
8899 return maxchar;
8900 else
8901 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902}
8903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008905fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8908 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8909 const int kind = PyUnicode_KIND(self);
8910 void *data = PyUnicode_DATA(self);
8911 int touched = 0;
8912 Py_UCS4 maxchar = 0;
8913 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 for(i = 0; i < len; ++i) {
8916 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8917 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8918 if (lo != ch) {
8919 if (lo > maxchar)
8920 maxchar = lo;
8921 PyUnicode_WRITE(kind, data, i, lo);
8922 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 else if (ch > maxchar)
8925 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 }
8927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 if (touched)
8929 return maxchar;
8930 else
8931 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932}
8933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008935fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8938 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8939 const int kind = PyUnicode_KIND(self);
8940 void *data = PyUnicode_DATA(self);
8941 int touched = 0;
8942 Py_UCS4 maxchar = 0;
8943 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 for(i = 0; i < len; ++i) {
8946 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8947 Py_UCS4 nu = 0;
8948
8949 if (Py_UNICODE_ISUPPER(ch))
8950 nu = Py_UNICODE_TOLOWER(ch);
8951 else if (Py_UNICODE_ISLOWER(ch))
8952 nu = Py_UNICODE_TOUPPER(ch);
8953
8954 if (nu != 0) {
8955 if (nu > maxchar)
8956 maxchar = nu;
8957 PyUnicode_WRITE(kind, data, i, nu);
8958 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 else if (ch > maxchar)
8961 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 if (touched)
8965 return maxchar;
8966 else
8967 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968}
8969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008971fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8974 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8975 const int kind = PyUnicode_KIND(self);
8976 void *data = PyUnicode_DATA(self);
8977 int touched = 0;
8978 Py_UCS4 maxchar = 0;
8979 Py_ssize_t i = 0;
8980 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008981
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008982 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984
8985 ch = PyUnicode_READ(kind, data, i);
8986 if (!Py_UNICODE_ISUPPER(ch)) {
8987 maxchar = Py_UNICODE_TOUPPER(ch);
8988 PyUnicode_WRITE(kind, data, i, maxchar);
8989 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 ++i;
8992 for(; i < len; ++i) {
8993 ch = PyUnicode_READ(kind, data, i);
8994 if (!Py_UNICODE_ISLOWER(ch)) {
8995 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8996 if (lo > maxchar)
8997 maxchar = lo;
8998 PyUnicode_WRITE(kind, data, i, lo);
8999 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 else if (ch > maxchar)
9002 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004
9005 if (touched)
9006 return maxchar;
9007 else
9008 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009}
9010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009012fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9015 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9016 const int kind = PyUnicode_KIND(self);
9017 void *data = PyUnicode_DATA(self);
9018 Py_UCS4 maxchar = 0;
9019 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 int previous_is_cased;
9021
9022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 if (len == 1) {
9024 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9025 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9026 if (ti != ch) {
9027 PyUnicode_WRITE(kind, data, i, ti);
9028 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 }
9030 else
9031 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 for(; i < len; ++i) {
9035 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9036 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009037
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 nu = Py_UNICODE_TOTITLE(ch);
9042
9043 if (nu > maxchar)
9044 maxchar = nu;
9045 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009046
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 if (Py_UNICODE_ISLOWER(ch) ||
9048 Py_UNICODE_ISUPPER(ch) ||
9049 Py_UNICODE_ISTITLE(ch))
9050 previous_is_cased = 1;
9051 else
9052 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055}
9056
Tim Peters8ce9f162004-08-27 01:49:32 +00009057PyObject *
9058PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009061 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009063 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009064 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9065 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009066 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009068 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070
Tim Peters05eba1f2004-08-27 21:32:02 +00009071 fseq = PySequence_Fast(seq, "");
9072 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009073 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009074 }
9075
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009076 /* NOTE: the following code can't call back into Python code,
9077 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009078 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009079
Tim Peters05eba1f2004-08-27 21:32:02 +00009080 seqlen = PySequence_Fast_GET_SIZE(fseq);
9081 /* If empty sequence, return u"". */
9082 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009083 Py_DECREF(fseq);
9084 Py_INCREF(unicode_empty);
9085 res = unicode_empty;
9086 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009087 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009088
Tim Peters05eba1f2004-08-27 21:32:02 +00009089 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009090 items = PySequence_Fast_ITEMS(fseq);
9091 if (seqlen == 1 && PyUnicode_CheckExact(items[0])) {
9092 res = items[0];
9093 Py_INCREF(res);
9094 Py_DECREF(fseq);
9095 return res;
9096 }
9097
9098 /* Set up sep and seplen */
9099 if (separator == NULL) {
9100 /* fall back to a blank space separator */
9101 sep = PyUnicode_FromOrdinal(' ');
9102 if (!sep)
9103 goto onError;
9104 maxchar = 32;
Tim Peters8ce9f162004-08-27 01:49:32 +00009105 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009106 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009107 if (!PyUnicode_Check(separator)) {
9108 PyErr_Format(PyExc_TypeError,
9109 "separator: expected str instance,"
9110 " %.80s found",
9111 Py_TYPE(separator)->tp_name);
9112 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009113 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009114 if (PyUnicode_READY(separator))
9115 goto onError;
9116 sep = separator;
9117 seplen = PyUnicode_GET_LENGTH(separator);
9118 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9119 /* inc refcount to keep this code path symmetric with the
9120 above case of a blank separator */
9121 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009122 }
9123
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009124 /* There are at least two things to join, or else we have a subclass
9125 * of str in the sequence.
9126 * Do a pre-pass to figure out the total amount of space we'll
9127 * need (sz), and see whether all argument are strings.
9128 */
9129 sz = 0;
9130 for (i = 0; i < seqlen; i++) {
9131 const Py_ssize_t old_sz = sz;
9132 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 if (!PyUnicode_Check(item)) {
9134 PyErr_Format(PyExc_TypeError,
9135 "sequence item %zd: expected str instance,"
9136 " %.80s found",
9137 i, Py_TYPE(item)->tp_name);
9138 goto onError;
9139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 if (PyUnicode_READY(item) == -1)
9141 goto onError;
9142 sz += PyUnicode_GET_LENGTH(item);
9143 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9144 if (item_maxchar > maxchar)
9145 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009146 if (i != 0)
9147 sz += seplen;
9148 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9149 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009151 goto onError;
9152 }
9153 }
Tim Petersced69f82003-09-16 20:30:58 +00009154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009156 if (res == NULL)
9157 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009158
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009159 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009161 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009162 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009164 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009165 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009168 itemlen = PyUnicode_GET_LENGTH(item);
9169 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009170 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009171 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009172 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009175
Tim Peters05eba1f2004-08-27 21:32:02 +00009176 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009178 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009182 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009184 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 return NULL;
9186}
9187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188#define FILL(kind, data, value, start, length) \
9189 do { \
9190 Py_ssize_t i_ = 0; \
9191 assert(kind != PyUnicode_WCHAR_KIND); \
9192 switch ((kind)) { \
9193 case PyUnicode_1BYTE_KIND: { \
9194 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9195 memset(to_, (unsigned char)value, length); \
9196 break; \
9197 } \
9198 case PyUnicode_2BYTE_KIND: { \
9199 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9200 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9201 break; \
9202 } \
9203 default: { \
9204 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9205 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9206 break; \
9207 } \
9208 } \
9209 } while (0)
9210
Victor Stinner9310abb2011-10-05 00:59:23 +02009211static PyObject *
9212pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009213 Py_ssize_t left,
9214 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 PyObject *u;
9218 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009219 int kind;
9220 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221
9222 if (left < 0)
9223 left = 0;
9224 if (right < 0)
9225 right = 0;
9226
Tim Peters7a29bd52001-09-12 03:03:31 +00009227 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 Py_INCREF(self);
9229 return self;
9230 }
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9233 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009234 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9235 return NULL;
9236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9238 if (fill > maxchar)
9239 maxchar = fill;
9240 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009241 if (!u)
9242 return NULL;
9243
9244 kind = PyUnicode_KIND(u);
9245 data = PyUnicode_DATA(u);
9246 if (left)
9247 FILL(kind, data, fill, 0, left);
9248 if (right)
9249 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009250 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009251 assert(_PyUnicode_CheckConsistency(u, 1));
9252 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255
Alexander Belopolsky40018472011-02-26 01:02:56 +00009256PyObject *
9257PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260
9261 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 switch(PyUnicode_KIND(string)) {
9266 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009267 if (PyUnicode_IS_ASCII(string))
9268 list = asciilib_splitlines(
9269 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9270 PyUnicode_GET_LENGTH(string), keepends);
9271 else
9272 list = ucs1lib_splitlines(
9273 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9274 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 break;
9276 case PyUnicode_2BYTE_KIND:
9277 list = ucs2lib_splitlines(
9278 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9279 PyUnicode_GET_LENGTH(string), keepends);
9280 break;
9281 case PyUnicode_4BYTE_KIND:
9282 list = ucs4lib_splitlines(
9283 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9284 PyUnicode_GET_LENGTH(string), keepends);
9285 break;
9286 default:
9287 assert(0);
9288 list = 0;
9289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 Py_DECREF(string);
9291 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292}
9293
Alexander Belopolsky40018472011-02-26 01:02:56 +00009294static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009295split(PyObject *self,
9296 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009297 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 int kind1, kind2, kind;
9300 void *buf1, *buf2;
9301 Py_ssize_t len1, len2;
9302 PyObject* out;
9303
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009305 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 if (PyUnicode_READY(self) == -1)
9308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 if (substring == NULL)
9311 switch(PyUnicode_KIND(self)) {
9312 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009313 if (PyUnicode_IS_ASCII(self))
9314 return asciilib_split_whitespace(
9315 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9316 PyUnicode_GET_LENGTH(self), maxcount
9317 );
9318 else
9319 return ucs1lib_split_whitespace(
9320 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9321 PyUnicode_GET_LENGTH(self), maxcount
9322 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 case PyUnicode_2BYTE_KIND:
9324 return ucs2lib_split_whitespace(
9325 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9326 PyUnicode_GET_LENGTH(self), maxcount
9327 );
9328 case PyUnicode_4BYTE_KIND:
9329 return ucs4lib_split_whitespace(
9330 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9331 PyUnicode_GET_LENGTH(self), maxcount
9332 );
9333 default:
9334 assert(0);
9335 return NULL;
9336 }
9337
9338 if (PyUnicode_READY(substring) == -1)
9339 return NULL;
9340
9341 kind1 = PyUnicode_KIND(self);
9342 kind2 = PyUnicode_KIND(substring);
9343 kind = kind1 > kind2 ? kind1 : kind2;
9344 buf1 = PyUnicode_DATA(self);
9345 buf2 = PyUnicode_DATA(substring);
9346 if (kind1 != kind)
9347 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9348 if (!buf1)
9349 return NULL;
9350 if (kind2 != kind)
9351 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9352 if (!buf2) {
9353 if (kind1 != kind) PyMem_Free(buf1);
9354 return NULL;
9355 }
9356 len1 = PyUnicode_GET_LENGTH(self);
9357 len2 = PyUnicode_GET_LENGTH(substring);
9358
9359 switch(kind) {
9360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9362 out = asciilib_split(
9363 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9364 else
9365 out = ucs1lib_split(
9366 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 break;
9368 case PyUnicode_2BYTE_KIND:
9369 out = ucs2lib_split(
9370 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9371 break;
9372 case PyUnicode_4BYTE_KIND:
9373 out = ucs4lib_split(
9374 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9375 break;
9376 default:
9377 out = NULL;
9378 }
9379 if (kind1 != kind)
9380 PyMem_Free(buf1);
9381 if (kind2 != kind)
9382 PyMem_Free(buf2);
9383 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384}
9385
Alexander Belopolsky40018472011-02-26 01:02:56 +00009386static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009387rsplit(PyObject *self,
9388 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009389 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 int kind1, kind2, kind;
9392 void *buf1, *buf2;
9393 Py_ssize_t len1, len2;
9394 PyObject* out;
9395
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009396 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009397 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 if (PyUnicode_READY(self) == -1)
9400 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (substring == NULL)
9403 switch(PyUnicode_KIND(self)) {
9404 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009405 if (PyUnicode_IS_ASCII(self))
9406 return asciilib_rsplit_whitespace(
9407 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9408 PyUnicode_GET_LENGTH(self), maxcount
9409 );
9410 else
9411 return ucs1lib_rsplit_whitespace(
9412 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9413 PyUnicode_GET_LENGTH(self), maxcount
9414 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 case PyUnicode_2BYTE_KIND:
9416 return ucs2lib_rsplit_whitespace(
9417 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9418 PyUnicode_GET_LENGTH(self), maxcount
9419 );
9420 case PyUnicode_4BYTE_KIND:
9421 return ucs4lib_rsplit_whitespace(
9422 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9423 PyUnicode_GET_LENGTH(self), maxcount
9424 );
9425 default:
9426 assert(0);
9427 return NULL;
9428 }
9429
9430 if (PyUnicode_READY(substring) == -1)
9431 return NULL;
9432
9433 kind1 = PyUnicode_KIND(self);
9434 kind2 = PyUnicode_KIND(substring);
9435 kind = kind1 > kind2 ? kind1 : kind2;
9436 buf1 = PyUnicode_DATA(self);
9437 buf2 = PyUnicode_DATA(substring);
9438 if (kind1 != kind)
9439 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9440 if (!buf1)
9441 return NULL;
9442 if (kind2 != kind)
9443 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9444 if (!buf2) {
9445 if (kind1 != kind) PyMem_Free(buf1);
9446 return NULL;
9447 }
9448 len1 = PyUnicode_GET_LENGTH(self);
9449 len2 = PyUnicode_GET_LENGTH(substring);
9450
9451 switch(kind) {
9452 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009453 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9454 out = asciilib_rsplit(
9455 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9456 else
9457 out = ucs1lib_rsplit(
9458 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 break;
9460 case PyUnicode_2BYTE_KIND:
9461 out = ucs2lib_rsplit(
9462 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9463 break;
9464 case PyUnicode_4BYTE_KIND:
9465 out = ucs4lib_rsplit(
9466 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9467 break;
9468 default:
9469 out = NULL;
9470 }
9471 if (kind1 != kind)
9472 PyMem_Free(buf1);
9473 if (kind2 != kind)
9474 PyMem_Free(buf2);
9475 return out;
9476}
9477
9478static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009479anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9480 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481{
9482 switch(kind) {
9483 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009484 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9485 return asciilib_find(buf1, len1, buf2, len2, offset);
9486 else
9487 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 case PyUnicode_2BYTE_KIND:
9489 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9490 case PyUnicode_4BYTE_KIND:
9491 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9492 }
9493 assert(0);
9494 return -1;
9495}
9496
9497static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009498anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9499 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500{
9501 switch(kind) {
9502 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009503 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9504 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9505 else
9506 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 case PyUnicode_2BYTE_KIND:
9508 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9509 case PyUnicode_4BYTE_KIND:
9510 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9511 }
9512 assert(0);
9513 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517replace(PyObject *self, PyObject *str1,
9518 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 PyObject *u;
9521 char *sbuf = PyUnicode_DATA(self);
9522 char *buf1 = PyUnicode_DATA(str1);
9523 char *buf2 = PyUnicode_DATA(str2);
9524 int srelease = 0, release1 = 0, release2 = 0;
9525 int skind = PyUnicode_KIND(self);
9526 int kind1 = PyUnicode_KIND(str1);
9527 int kind2 = PyUnicode_KIND(str2);
9528 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9529 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9530 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531
9532 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009535 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 if (skind < kind1)
9538 /* substring too wide to be present */
9539 goto nothing;
9540
9541 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009542 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009543 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009545 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009547 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 Py_UCS4 u1, u2, maxchar;
9549 int mayshrink, rkind;
9550 u1 = PyUnicode_READ_CHAR(str1, 0);
9551 if (!findchar(sbuf, PyUnicode_KIND(self),
9552 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009553 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 u2 = PyUnicode_READ_CHAR(str2, 0);
9555 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9556 /* Replacing u1 with u2 may cause a maxchar reduction in the
9557 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 if (u2 > maxchar) {
9559 maxchar = u2;
9560 mayshrink = 0;
9561 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009562 else
9563 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009565 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009567 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 rkind = PyUnicode_KIND(u);
9569 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9570 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009571 if (--maxcount < 0)
9572 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (mayshrink) {
9576 PyObject *tmp = u;
9577 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9578 PyUnicode_GET_LENGTH(tmp));
9579 Py_DECREF(tmp);
9580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 int rkind = skind;
9583 char *res;
9584 if (kind1 < rkind) {
9585 /* widen substring */
9586 buf1 = _PyUnicode_AsKind(str1, rkind);
9587 if (!buf1) goto error;
9588 release1 = 1;
9589 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009590 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009591 if (i < 0)
9592 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 if (rkind > kind2) {
9594 /* widen replacement */
9595 buf2 = _PyUnicode_AsKind(str2, rkind);
9596 if (!buf2) goto error;
9597 release2 = 1;
9598 }
9599 else if (rkind < kind2) {
9600 /* widen self and buf1 */
9601 rkind = kind2;
9602 if (release1) PyMem_Free(buf1);
9603 sbuf = _PyUnicode_AsKind(self, rkind);
9604 if (!sbuf) goto error;
9605 srelease = 1;
9606 buf1 = _PyUnicode_AsKind(str1, rkind);
9607 if (!buf1) goto error;
9608 release1 = 1;
9609 }
9610 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9611 if (!res) {
9612 PyErr_NoMemory();
9613 goto error;
9614 }
9615 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009616 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9618 buf2,
9619 PyUnicode_KIND_SIZE(rkind, len2));
9620 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009621
9622 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009623 i = anylib_find(rkind, self,
9624 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9625 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009626 if (i == -1)
9627 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9629 buf2,
9630 PyUnicode_KIND_SIZE(rkind, len2));
9631 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633
9634 u = PyUnicode_FromKindAndData(rkind, res, slen);
9635 PyMem_Free(res);
9636 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 Py_ssize_t n, i, j, ires;
9641 Py_ssize_t product, new_size;
9642 int rkind = skind;
9643 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 if (kind1 < rkind) {
9646 buf1 = _PyUnicode_AsKind(str1, rkind);
9647 if (!buf1) goto error;
9648 release1 = 1;
9649 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009650 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009651 if (n == 0)
9652 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 if (kind2 < rkind) {
9654 buf2 = _PyUnicode_AsKind(str2, rkind);
9655 if (!buf2) goto error;
9656 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 else if (kind2 > rkind) {
9659 rkind = kind2;
9660 sbuf = _PyUnicode_AsKind(self, rkind);
9661 if (!sbuf) goto error;
9662 srelease = 1;
9663 if (release1) PyMem_Free(buf1);
9664 buf1 = _PyUnicode_AsKind(str1, rkind);
9665 if (!buf1) goto error;
9666 release1 = 1;
9667 }
9668 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9669 PyUnicode_GET_LENGTH(str1))); */
9670 product = n * (len2-len1);
9671 if ((product / (len2-len1)) != n) {
9672 PyErr_SetString(PyExc_OverflowError,
9673 "replace string is too long");
9674 goto error;
9675 }
9676 new_size = slen + product;
9677 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9678 PyErr_SetString(PyExc_OverflowError,
9679 "replace string is too long");
9680 goto error;
9681 }
9682 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9683 if (!res)
9684 goto error;
9685 ires = i = 0;
9686 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009687 while (n-- > 0) {
9688 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009689 j = anylib_find(rkind, self,
9690 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9691 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009692 if (j == -1)
9693 break;
9694 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009695 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9697 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9698 PyUnicode_KIND_SIZE(rkind, j-i));
9699 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009700 }
9701 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (len2 > 0) {
9703 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9704 buf2,
9705 PyUnicode_KIND_SIZE(rkind, len2));
9706 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009711 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9713 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9714 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 } else {
9716 /* interleave */
9717 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9719 buf2,
9720 PyUnicode_KIND_SIZE(rkind, len2));
9721 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009722 if (--n <= 0)
9723 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9725 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9726 PyUnicode_KIND_SIZE(rkind, 1));
9727 ires++;
9728 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9731 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9732 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009733 }
Victor Stinnerf48323e2011-10-05 23:27:08 +02009734 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
9735 u = unicode_fromascii((unsigned char*)res, new_size);
9736 else
9737 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009738 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 if (srelease)
9741 PyMem_FREE(sbuf);
9742 if (release1)
9743 PyMem_FREE(buf1);
9744 if (release2)
9745 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009746 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009748
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009750 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (srelease)
9752 PyMem_FREE(sbuf);
9753 if (release1)
9754 PyMem_FREE(buf1);
9755 if (release2)
9756 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009757 if (PyUnicode_CheckExact(self)) {
9758 Py_INCREF(self);
9759 return (PyObject *) self;
9760 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009761 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 error:
9763 if (srelease && sbuf)
9764 PyMem_FREE(sbuf);
9765 if (release1 && buf1)
9766 PyMem_FREE(buf1);
9767 if (release2 && buf2)
9768 PyMem_FREE(buf2);
9769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
9771
9772/* --- Unicode Object Methods --------------------------------------------- */
9773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009774PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776\n\
9777Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009778characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
/* Implementation of S.title(): delegates to fixup() with the fixtitle
   transformation and returns whatever fixup() returns. */
static PyObject*
unicode_title(PyObject *self)
{
    return fixup(self, fixtitle);
}
9785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009786PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788\n\
9789Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009790have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791
/* Implementation of S.capitalize(): delegates to fixup() with the
   fixcapitalize transformation and returns whatever fixup() returns. */
static PyObject*
unicode_capitalize(PyObject *self)
{
    return fixup(self, fixcapitalize);
}
9797
9798#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009799PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009800 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801\n\
9802Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009803normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804
/* Disabled (compiled out by the surrounding #if 0) implementation of
   S.capwords(): split self with split(self, NULL, -1), run
   fixup(..., fixcapitalize) on every list item, then join the items with
   PyUnicode_Join(NULL, list).  Returns the joined string, or NULL if the
   split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            /* item is NULL here, so the function returns NULL below */
            goto onError;
        /* release the old item before storing the capitalized one in its
           slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit: the list is released on success and failure alike */
    Py_DECREF(list);
    return (PyObject *)item;
}
9834#endif
9835
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009836/* Argument converter. Coerces to a single unicode character */
9837
9838static int
9839convert_uc(PyObject *obj, void *addr)
9840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009843
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844 uniobj = PyUnicode_FromObject(obj);
9845 if (uniobj == NULL) {
9846 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 return 0;
9849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 Py_DECREF(uniobj);
9854 return 0;
9855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 Py_DECREF(uniobj);
9858 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009859}
9860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009861PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009864Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009865done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
9867static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009868unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009870 Py_ssize_t marg, left;
9871 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 Py_UCS4 fillchar = ' ';
9873
Victor Stinnere9a29352011-10-01 02:14:59 +02009874 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
Victor Stinnere9a29352011-10-01 02:14:59 +02009877 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878 return NULL;
9879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 Py_INCREF(self);
9882 return (PyObject*) self;
9883 }
9884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 left = marg / 2 + (marg & width & 1);
9887
Victor Stinner9310abb2011-10-05 00:59:23 +02009888 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Marc-André Lemburge5034372000-08-08 08:04:29 +00009891#if 0
9892
9893/* This code should go into some future Unicode collation support
9894 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009895 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009896
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009897/* speedy UTF-16 code point order comparison */
9898/* gleaned from: */
9899/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9900
/* Adjustment table indexed by the top five bits (c >> 11) of a 16-bit code
   unit.  Index 27 (units 0xD800..0xDFFF) adds 0x2000; indices 28..31
   (units 0xE000..0xFFFF) subtract 0x800; every other entry is 0. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled (compiled out by the surrounding #if 0) comparison of two
   Py_UNICODE buffers, unit by unit, after applying utf16Fixup to each
   unit.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->_base._base.length;
    len2 = str2->_base._base.length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only units above that value are looked
           up in the table */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
9940
Marc-André Lemburge5034372000-08-08 08:04:29 +00009941#else
9942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943/* This function assumes that str1 and str2 are readied by the caller. */
9944
Marc-André Lemburge5034372000-08-08 08:04:29 +00009945static int
9946unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 int kind1, kind2;
9949 void *data1, *data2;
9950 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 kind1 = PyUnicode_KIND(str1);
9953 kind2 = PyUnicode_KIND(str2);
9954 data1 = PyUnicode_DATA(str1);
9955 data2 = PyUnicode_DATA(str2);
9956 len1 = PyUnicode_GET_LENGTH(str1);
9957 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 for (i = 0; i < len1 && i < len2; ++i) {
9960 Py_UCS4 c1, c2;
9961 c1 = PyUnicode_READ(kind1, data1, i);
9962 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009963
9964 if (c1 != c2)
9965 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009966 }
9967
9968 return (len1 < len2) ? -1 : (len1 != len2);
9969}
9970
9971#endif
9972
Alexander Belopolsky40018472011-02-26 01:02:56 +00009973int
9974PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9977 if (PyUnicode_READY(left) == -1 ||
9978 PyUnicode_READY(right) == -1)
9979 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009980 return unicode_compare((PyUnicodeObject *)left,
9981 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009983 PyErr_Format(PyExc_TypeError,
9984 "Can't compare %.100s and %.100s",
9985 left->ob_type->tp_name,
9986 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 return -1;
9988}
9989
Martin v. Löwis5b222132007-06-10 09:51:05 +00009990int
9991PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 Py_ssize_t i;
9994 int kind;
9995 void *data;
9996 Py_UCS4 chr;
9997
Victor Stinner910337b2011-10-03 03:20:16 +02009998 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_READY(uni) == -1)
10000 return -1;
10001 kind = PyUnicode_KIND(uni);
10002 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010003 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10005 if (chr != str[i])
10006 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010007 /* This check keeps Python strings that end in '\0' from comparing equal
10008 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010010 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010011 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010012 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010013 return 0;
10014}
10015
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010016
Benjamin Peterson29060642009-01-31 22:14:21 +000010017#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010018 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010019
Alexander Belopolsky40018472011-02-26 01:02:56 +000010020PyObject *
10021PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010022{
10023 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010024
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010025 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10026 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (PyUnicode_READY(left) == -1 ||
10028 PyUnicode_READY(right) == -1)
10029 return NULL;
10030 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10031 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010032 if (op == Py_EQ) {
10033 Py_INCREF(Py_False);
10034 return Py_False;
10035 }
10036 if (op == Py_NE) {
10037 Py_INCREF(Py_True);
10038 return Py_True;
10039 }
10040 }
10041 if (left == right)
10042 result = 0;
10043 else
10044 result = unicode_compare((PyUnicodeObject *)left,
10045 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010046
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010047 /* Convert the return value to a Boolean */
10048 switch (op) {
10049 case Py_EQ:
10050 v = TEST_COND(result == 0);
10051 break;
10052 case Py_NE:
10053 v = TEST_COND(result != 0);
10054 break;
10055 case Py_LE:
10056 v = TEST_COND(result <= 0);
10057 break;
10058 case Py_GE:
10059 v = TEST_COND(result >= 0);
10060 break;
10061 case Py_LT:
10062 v = TEST_COND(result == -1);
10063 break;
10064 case Py_GT:
10065 v = TEST_COND(result == 1);
10066 break;
10067 default:
10068 PyErr_BadArgument();
10069 return NULL;
10070 }
10071 Py_INCREF(v);
10072 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010073 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010074
Brian Curtindfc80e32011-08-10 20:28:54 -050010075 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010076}
10077
Alexander Belopolsky40018472011-02-26 01:02:56 +000010078int
10079PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010080{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010081 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 int kind1, kind2, kind;
10083 void *buf1, *buf2;
10084 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010085 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010086
10087 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 sub = PyUnicode_FromObject(element);
10089 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 PyErr_Format(PyExc_TypeError,
10091 "'in <string>' requires string as left operand, not %s",
10092 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010093 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 if (PyUnicode_READY(sub) == -1)
10096 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010097
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010099 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010100 Py_DECREF(sub);
10101 return -1;
10102 }
10103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 kind1 = PyUnicode_KIND(str);
10105 kind2 = PyUnicode_KIND(sub);
10106 kind = kind1 > kind2 ? kind1 : kind2;
10107 buf1 = PyUnicode_DATA(str);
10108 buf2 = PyUnicode_DATA(sub);
10109 if (kind1 != kind)
10110 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10111 if (!buf1) {
10112 Py_DECREF(sub);
10113 return -1;
10114 }
10115 if (kind2 != kind)
10116 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10117 if (!buf2) {
10118 Py_DECREF(sub);
10119 if (kind1 != kind) PyMem_Free(buf1);
10120 return -1;
10121 }
10122 len1 = PyUnicode_GET_LENGTH(str);
10123 len2 = PyUnicode_GET_LENGTH(sub);
10124
10125 switch(kind) {
10126 case PyUnicode_1BYTE_KIND:
10127 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10128 break;
10129 case PyUnicode_2BYTE_KIND:
10130 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10131 break;
10132 case PyUnicode_4BYTE_KIND:
10133 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10134 break;
10135 default:
10136 result = -1;
10137 assert(0);
10138 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139
10140 Py_DECREF(str);
10141 Py_DECREF(sub);
10142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (kind1 != kind)
10144 PyMem_Free(buf1);
10145 if (kind2 != kind)
10146 PyMem_Free(buf2);
10147
Guido van Rossum403d68b2000-03-13 15:55:09 +000010148 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010149}
10150
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151/* Concat to string or Unicode object giving a new Unicode object. */
10152
Alexander Belopolsky40018472011-02-26 01:02:56 +000010153PyObject *
10154PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 PyObject *u = NULL, *v = NULL, *w;
10157 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158
10159 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010165 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
10167 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010168 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010169 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010172 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 }
10176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010178 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 w = PyUnicode_New(
10182 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10183 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010186 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10187 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188 Py_DECREF(u);
10189 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010190 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 Py_XDECREF(u);
10195 Py_XDECREF(v);
10196 return NULL;
10197}
10198
Victor Stinnerb0923652011-10-04 01:17:31 +020010199static void
10200unicode_append_inplace(PyObject **p_left, PyObject *right)
10201{
10202 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010203
10204 assert(PyUnicode_IS_READY(*p_left));
10205 assert(PyUnicode_IS_READY(right));
10206
10207 left_len = PyUnicode_GET_LENGTH(*p_left);
10208 right_len = PyUnicode_GET_LENGTH(right);
10209 if (left_len > PY_SSIZE_T_MAX - right_len) {
10210 PyErr_SetString(PyExc_OverflowError,
10211 "strings are too large to concat");
10212 goto error;
10213 }
10214 new_len = left_len + right_len;
10215
10216 /* Now we own the last reference to 'left', so we can resize it
10217 * in-place.
10218 */
10219 if (unicode_resize(p_left, new_len) != 0) {
10220 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10221 * deallocated so it cannot be put back into
10222 * 'variable'. The MemoryError is raised when there
10223 * is no value in 'variable', which might (very
10224 * remotely) be a cause of incompatibilities.
10225 */
10226 goto error;
10227 }
10228 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010229 copy_characters(*p_left, left_len, right, 0, right_len);
10230 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010231 return;
10232
10233error:
10234 Py_DECREF(*p_left);
10235 *p_left = NULL;
10236}
10237
Walter Dörwald1ab83302007-05-18 17:15:44 +000010238void
Victor Stinner23e56682011-10-03 03:54:37 +020010239PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010240{
Victor Stinner23e56682011-10-03 03:54:37 +020010241 PyObject *left, *res;
10242
10243 if (p_left == NULL) {
10244 if (!PyErr_Occurred())
10245 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010246 return;
10247 }
Victor Stinner23e56682011-10-03 03:54:37 +020010248 left = *p_left;
10249 if (right == NULL || !PyUnicode_Check(left)) {
10250 if (!PyErr_Occurred())
10251 PyErr_BadInternalCall();
10252 goto error;
10253 }
10254
Victor Stinnere1335c72011-10-04 20:53:03 +020010255 if (PyUnicode_READY(left))
10256 goto error;
10257 if (PyUnicode_READY(right))
10258 goto error;
10259
Victor Stinner23e56682011-10-03 03:54:37 +020010260 if (PyUnicode_CheckExact(left) && left != unicode_empty
10261 && PyUnicode_CheckExact(right) && right != unicode_empty
10262 && unicode_resizable(left)
10263 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10264 || _PyUnicode_WSTR(left) != NULL))
10265 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010266 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10267 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010268 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010269 not so different than duplicating the string. */
10270 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010271 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010272 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010273 if (p_left != NULL)
10274 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010275 return;
10276 }
10277 }
10278
10279 res = PyUnicode_Concat(left, right);
10280 if (res == NULL)
10281 goto error;
10282 Py_DECREF(left);
10283 *p_left = res;
10284 return;
10285
10286error:
10287 Py_DECREF(*p_left);
10288 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010289}
10290
10291void
10292PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10293{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010294 PyUnicode_Append(pleft, right);
10295 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010296}
10297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010298PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010299 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010302string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010303interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304
10305static PyObject *
10306unicode_count(PyUnicodeObject *self, PyObject *args)
10307{
10308 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010309 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010310 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 int kind1, kind2, kind;
10313 void *buf1, *buf2;
10314 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
Jesus Ceaac451502011-04-20 17:09:23 +020010316 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10317 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 kind1 = PyUnicode_KIND(self);
10321 kind2 = PyUnicode_KIND(substring);
10322 kind = kind1 > kind2 ? kind1 : kind2;
10323 buf1 = PyUnicode_DATA(self);
10324 buf2 = PyUnicode_DATA(substring);
10325 if (kind1 != kind)
10326 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10327 if (!buf1) {
10328 Py_DECREF(substring);
10329 return NULL;
10330 }
10331 if (kind2 != kind)
10332 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10333 if (!buf2) {
10334 Py_DECREF(substring);
10335 if (kind1 != kind) PyMem_Free(buf1);
10336 return NULL;
10337 }
10338 len1 = PyUnicode_GET_LENGTH(self);
10339 len2 = PyUnicode_GET_LENGTH(substring);
10340
10341 ADJUST_INDICES(start, end, len1);
10342 switch(kind) {
10343 case PyUnicode_1BYTE_KIND:
10344 iresult = ucs1lib_count(
10345 ((Py_UCS1*)buf1) + start, end - start,
10346 buf2, len2, PY_SSIZE_T_MAX
10347 );
10348 break;
10349 case PyUnicode_2BYTE_KIND:
10350 iresult = ucs2lib_count(
10351 ((Py_UCS2*)buf1) + start, end - start,
10352 buf2, len2, PY_SSIZE_T_MAX
10353 );
10354 break;
10355 case PyUnicode_4BYTE_KIND:
10356 iresult = ucs4lib_count(
10357 ((Py_UCS4*)buf1) + start, end - start,
10358 buf2, len2, PY_SSIZE_T_MAX
10359 );
10360 break;
10361 default:
10362 assert(0); iresult = 0;
10363 }
10364
10365 result = PyLong_FromSsize_t(iresult);
10366
10367 if (kind1 != kind)
10368 PyMem_Free(buf1);
10369 if (kind2 != kind)
10370 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
10372 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010373
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 return result;
10375}
10376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010377PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010378 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010380Encode S using the codec registered for encoding. Default encoding\n\
10381is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010382handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010383a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10384'xmlcharrefreplace' as well as any other name registered with\n\
10385codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
10387static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010388unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010390 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391 char *encoding = NULL;
10392 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010393
Benjamin Peterson308d6372009-09-18 21:42:35 +000010394 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10395 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010397 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010398}
10399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010400PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010401 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402\n\
10403Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010404If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
10406static PyObject*
10407unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10408{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010409 Py_ssize_t i, j, line_pos, src_len, incr;
10410 Py_UCS4 ch;
10411 PyObject *u;
10412 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010414 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010415 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
10417 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419
Antoine Pitrou22425222011-10-04 19:10:51 +020010420 if (PyUnicode_READY(self) == -1)
10421 return NULL;
10422
Thomas Wouters7e474022000-07-16 12:04:32 +000010423 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010424 src_len = PyUnicode_GET_LENGTH(self);
10425 i = j = line_pos = 0;
10426 kind = PyUnicode_KIND(self);
10427 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010428 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010429 for (; i < src_len; i++) {
10430 ch = PyUnicode_READ(kind, src_data, i);
10431 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010432 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010434 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010436 goto overflow;
10437 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010438 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010439 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010442 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010443 goto overflow;
10444 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010446 if (ch == '\n' || ch == '\r')
10447 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010449 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010450 if (!found && PyUnicode_CheckExact(self)) {
10451 Py_INCREF((PyObject *) self);
10452 return (PyObject *) self;
10453 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010454
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010456 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457 if (!u)
10458 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010459 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460
Antoine Pitroue71d5742011-10-04 15:55:09 +020010461 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462
Antoine Pitroue71d5742011-10-04 15:55:09 +020010463 for (; i < src_len; i++) {
10464 ch = PyUnicode_READ(kind, src_data, i);
10465 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010467 incr = tabsize - (line_pos % tabsize);
10468 line_pos += incr;
10469 while (incr--) {
10470 PyUnicode_WRITE(kind, dest_data, j, ' ');
10471 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010472 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010476 line_pos++;
10477 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010478 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010479 if (ch == '\n' || ch == '\r')
10480 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010482 }
10483 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010484#ifndef DONT_MAKE_RESULT_READY
10485 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 Py_DECREF(u);
10487 return NULL;
10488 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010489#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010490 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010492
Antoine Pitroue71d5742011-10-04 15:55:09 +020010493 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010494 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496}
10497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010498PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500\n\
10501Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010502such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503arguments start and end are interpreted as in slice notation.\n\
10504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010505Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
10507static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509{
Jesus Ceaac451502011-04-20 17:09:23 +020010510 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010511 Py_ssize_t start;
10512 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514
Jesus Ceaac451502011-04-20 17:09:23 +020010515 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10516 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (PyUnicode_READY(self) == -1)
10520 return NULL;
10521 if (PyUnicode_READY(substring) == -1)
10522 return NULL;
10523
10524 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010525 asciilib_find_slice, ucs1lib_find_slice,
10526 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
10530 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (result == -2)
10533 return NULL;
10534
Christian Heimes217cfd12007-12-02 14:31:20 +000010535 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536}
10537
10538static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010539unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010541 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10542 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545}
10546
Guido van Rossumc2504932007-09-18 19:42:40 +000010547/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010548 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010549static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010550unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Guido van Rossumc2504932007-09-18 19:42:40 +000010552 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010553 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (_PyUnicode_HASH(self) != -1)
10556 return _PyUnicode_HASH(self);
10557 if (PyUnicode_READY(self) == -1)
10558 return -1;
10559 len = PyUnicode_GET_LENGTH(self);
10560
10561 /* The hash function as a macro, gets expanded three times below. */
10562#define HASH(P) \
10563 x = (Py_uhash_t)*P << 7; \
10564 while (--len >= 0) \
10565 x = (1000003*x) ^ (Py_uhash_t)*P++;
10566
10567 switch (PyUnicode_KIND(self)) {
10568 case PyUnicode_1BYTE_KIND: {
10569 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10570 HASH(c);
10571 break;
10572 }
10573 case PyUnicode_2BYTE_KIND: {
10574 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10575 HASH(s);
10576 break;
10577 }
10578 default: {
10579 Py_UCS4 *l;
10580 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10581 "Impossible switch case in unicode_hash");
10582 l = PyUnicode_4BYTE_DATA(self);
10583 HASH(l);
10584 break;
10585 }
10586 }
10587 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10588
Guido van Rossumc2504932007-09-18 19:42:40 +000010589 if (x == -1)
10590 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010592 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010596PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010599Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
10601static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010604 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010605 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010606 Py_ssize_t start;
10607 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608
Jesus Ceaac451502011-04-20 17:09:23 +020010609 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10610 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (PyUnicode_READY(self) == -1)
10614 return NULL;
10615 if (PyUnicode_READY(substring) == -1)
10616 return NULL;
10617
10618 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010619 asciilib_find_slice, ucs1lib_find_slice,
10620 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623
10624 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 if (result == -2)
10627 return NULL;
10628
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 if (result < 0) {
10630 PyErr_SetString(PyExc_ValueError, "substring not found");
10631 return NULL;
10632 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633
Christian Heimes217cfd12007-12-02 14:31:20 +000010634 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635}
10636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010637PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010638 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010640Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010641at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642
10643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010644unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 Py_ssize_t i, length;
10647 int kind;
10648 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 int cased;
10650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (PyUnicode_READY(self) == -1)
10652 return NULL;
10653 length = PyUnicode_GET_LENGTH(self);
10654 kind = PyUnicode_KIND(self);
10655 data = PyUnicode_DATA(self);
10656
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (length == 1)
10659 return PyBool_FromLong(
10660 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010662 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010665
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 for (i = 0; i < length; i++) {
10668 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010669
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10671 return PyBool_FromLong(0);
10672 else if (!cased && Py_UNICODE_ISLOWER(ch))
10673 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010675 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676}
10677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010678PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010679 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010681Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010682at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
10684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010685unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 Py_ssize_t i, length;
10688 int kind;
10689 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 int cased;
10691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (PyUnicode_READY(self) == -1)
10693 return NULL;
10694 length = PyUnicode_GET_LENGTH(self);
10695 kind = PyUnicode_KIND(self);
10696 data = PyUnicode_DATA(self);
10697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (length == 1)
10700 return PyBool_FromLong(
10701 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010703 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 for (i = 0; i < length; i++) {
10709 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010710
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10712 return PyBool_FromLong(0);
10713 else if (!cased && Py_UNICODE_ISUPPER(ch))
10714 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010716 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717}
10718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010719PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010720 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010722Return True if S is a titlecased string and there is at least one\n\
10723character in S, i.e. upper- and titlecase characters may only\n\
10724follow uncased characters and lowercase characters only cased ones.\n\
10725Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010728unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 Py_ssize_t i, length;
10731 int kind;
10732 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 int cased, previous_is_cased;
10734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (PyUnicode_READY(self) == -1)
10736 return NULL;
10737 length = PyUnicode_GET_LENGTH(self);
10738 kind = PyUnicode_KIND(self);
10739 data = PyUnicode_DATA(self);
10740
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 if (length == 1) {
10743 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10744 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10745 (Py_UNICODE_ISUPPER(ch) != 0));
10746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010748 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010751
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752 cased = 0;
10753 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 for (i = 0; i < length; i++) {
10755 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010756
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10758 if (previous_is_cased)
10759 return PyBool_FromLong(0);
10760 previous_is_cased = 1;
10761 cased = 1;
10762 }
10763 else if (Py_UNICODE_ISLOWER(ch)) {
10764 if (!previous_is_cased)
10765 return PyBool_FromLong(0);
10766 previous_is_cased = 1;
10767 cased = 1;
10768 }
10769 else
10770 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010772 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773}
10774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010775PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010776 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010778Return True if all characters in S are whitespace\n\
10779and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780
10781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010782unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 Py_ssize_t i, length;
10785 int kind;
10786 void *data;
10787
10788 if (PyUnicode_READY(self) == -1)
10789 return NULL;
10790 length = PyUnicode_GET_LENGTH(self);
10791 kind = PyUnicode_KIND(self);
10792 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (length == 1)
10796 return PyBool_FromLong(
10797 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010799 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 for (i = 0; i < length; i++) {
10804 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010805 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010806 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010813\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010814Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010816
10817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010818unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 Py_ssize_t i, length;
10821 int kind;
10822 void *data;
10823
10824 if (PyUnicode_READY(self) == -1)
10825 return NULL;
10826 length = PyUnicode_GET_LENGTH(self);
10827 kind = PyUnicode_KIND(self);
10828 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010829
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010830 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 if (length == 1)
10832 return PyBool_FromLong(
10833 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010834
10835 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 for (i = 0; i < length; i++) {
10840 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010843 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010844}
10845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010849Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010850and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010851
10852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010853unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 int kind;
10856 void *data;
10857 Py_ssize_t len, i;
10858
10859 if (PyUnicode_READY(self) == -1)
10860 return NULL;
10861
10862 kind = PyUnicode_KIND(self);
10863 data = PyUnicode_DATA(self);
10864 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010865
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010866 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 if (len == 1) {
10868 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10869 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10870 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010871
10872 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 for (i = 0; i < len; i++) {
10877 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010878 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010880 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010881 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010882}
10883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010884PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010885 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010887Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010888False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889
10890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010891unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 Py_ssize_t i, length;
10894 int kind;
10895 void *data;
10896
10897 if (PyUnicode_READY(self) == -1)
10898 return NULL;
10899 length = PyUnicode_GET_LENGTH(self);
10900 kind = PyUnicode_KIND(self);
10901 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (length == 1)
10905 return PyBool_FromLong(
10906 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010908 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 for (i = 0; i < length; i++) {
10913 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010916 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917}
10918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010919PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010920 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010922Return True if all characters in S are digits\n\
10923and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010926unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 Py_ssize_t i, length;
10929 int kind;
10930 void *data;
10931
10932 if (PyUnicode_READY(self) == -1)
10933 return NULL;
10934 length = PyUnicode_GET_LENGTH(self);
10935 kind = PyUnicode_KIND(self);
10936 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (length == 1) {
10940 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10941 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010944 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010946 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 for (i = 0; i < length; i++) {
10949 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010952 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010958Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010959False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010962unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 Py_ssize_t i, length;
10965 int kind;
10966 void *data;
10967
10968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970 length = PyUnicode_GET_LENGTH(self);
10971 kind = PyUnicode_KIND(self);
10972 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (length == 1)
10976 return PyBool_FromLong(
10977 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010979 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 for (i = 0; i < length; i++) {
10984 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988}
10989
Martin v. Löwis47383402007-08-15 07:32:56 +000010990int
10991PyUnicode_IsIdentifier(PyObject *self)
10992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 int kind;
10994 void *data;
10995 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010996 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (PyUnicode_READY(self) == -1) {
10999 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 }
11002
11003 /* Special case for empty strings */
11004 if (PyUnicode_GET_LENGTH(self) == 0)
11005 return 0;
11006 kind = PyUnicode_KIND(self);
11007 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011008
11009 /* PEP 3131 says that the first character must be in
11010 XID_Start and subsequent characters in XID_Continue,
11011 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011012 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011013 letters, digits, underscore). However, given the current
11014 definition of XID_Start and XID_Continue, it is sufficient
11015 to check just for these, except that _ must be allowed
11016 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011018 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011019 return 0;
11020
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011021 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011024 return 1;
11025}
11026
11027PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011029\n\
11030Return True if S is a valid identifier according\n\
11031to the language definition.");
11032
11033static PyObject*
11034unicode_isidentifier(PyObject *self)
11035{
11036 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11037}
11038
Georg Brandl559e5d72008-06-11 18:37:52 +000011039PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011041\n\
11042Return True if all characters in S are considered\n\
11043printable in repr() or S is empty, False otherwise.");
11044
11045static PyObject*
11046unicode_isprintable(PyObject *self)
11047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 Py_ssize_t i, length;
11049 int kind;
11050 void *data;
11051
11052 if (PyUnicode_READY(self) == -1)
11053 return NULL;
11054 length = PyUnicode_GET_LENGTH(self);
11055 kind = PyUnicode_KIND(self);
11056 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011057
11058 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (length == 1)
11060 return PyBool_FromLong(
11061 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 for (i = 0; i < length; i++) {
11064 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011065 Py_RETURN_FALSE;
11066 }
11067 }
11068 Py_RETURN_TRUE;
11069}
11070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011071PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011072 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073\n\
11074Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011075iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076
11077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011078unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011080 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081}
11082
Martin v. Löwis18e16552006-02-15 17:27:45 +000011083static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084unicode_length(PyUnicodeObject *self)
11085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 if (PyUnicode_READY(self) == -1)
11087 return -1;
11088 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089}
11090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011091PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011092 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011094Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011095done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096
11097static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011098unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011100 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 Py_UCS4 fillchar = ' ';
11102
11103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011105
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011106 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 return NULL;
11108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 Py_INCREF(self);
11111 return (PyObject*) self;
11112 }
11113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115}
11116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011117PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
11122static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011123unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 return fixup(self, fixlower);
11126}
11127
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11136
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011137/* externally visible for str.strip(unicode) */
11138PyObject *
11139_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 void *data;
11142 int kind;
11143 Py_ssize_t i, j, len;
11144 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11147 return NULL;
11148
11149 kind = PyUnicode_KIND(self);
11150 data = PyUnicode_DATA(self);
11151 len = PyUnicode_GET_LENGTH(self);
11152 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11153 PyUnicode_DATA(sepobj),
11154 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155
Benjamin Peterson14339b62009-01-31 16:36:08 +000011156 i = 0;
11157 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 while (i < len &&
11159 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 i++;
11161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011163
Benjamin Peterson14339b62009-01-31 16:36:08 +000011164 j = len;
11165 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 do {
11167 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 } while (j >= i &&
11169 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011172
Victor Stinner12bab6d2011-10-01 01:53:49 +020011173 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174}
11175
11176PyObject*
11177PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11178{
11179 unsigned char *data;
11180 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011181 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182
Victor Stinnerde636f32011-10-01 03:55:54 +020011183 if (PyUnicode_READY(self) == -1)
11184 return NULL;
11185
11186 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11187
Victor Stinner12bab6d2011-10-01 01:53:49 +020011188 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011190 if (PyUnicode_CheckExact(self)) {
11191 Py_INCREF(self);
11192 return self;
11193 }
11194 else
11195 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 }
11197
Victor Stinner12bab6d2011-10-01 01:53:49 +020011198 length = end - start;
11199 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011200 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201
Victor Stinnerde636f32011-10-01 03:55:54 +020011202 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011203 PyErr_SetString(PyExc_IndexError, "string index out of range");
11204 return NULL;
11205 }
11206
Victor Stinnerb9275c12011-10-05 14:01:42 +020011207 if (PyUnicode_IS_ASCII(self)) {
11208 kind = PyUnicode_KIND(self);
11209 data = PyUnicode_1BYTE_DATA(self);
11210 return unicode_fromascii(data + start, length);
11211 }
11212 else {
11213 kind = PyUnicode_KIND(self);
11214 data = PyUnicode_1BYTE_DATA(self);
11215 return PyUnicode_FromKindAndData(kind,
11216 data + PyUnicode_KIND_SIZE(kind, start),
11217 length);
11218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011222do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 int kind;
11225 void *data;
11226 Py_ssize_t len, i, j;
11227
11228 if (PyUnicode_READY(self) == -1)
11229 return NULL;
11230
11231 kind = PyUnicode_KIND(self);
11232 data = PyUnicode_DATA(self);
11233 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011234
Benjamin Peterson14339b62009-01-31 16:36:08 +000011235 i = 0;
11236 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011238 i++;
11239 }
11240 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011241
Benjamin Peterson14339b62009-01-31 16:36:08 +000011242 j = len;
11243 if (striptype != LEFTSTRIP) {
11244 do {
11245 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 j++;
11248 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011249
Victor Stinner12bab6d2011-10-01 01:53:49 +020011250 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251}
11252
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011253
11254static PyObject *
11255do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11256{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011257 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011258
Benjamin Peterson14339b62009-01-31 16:36:08 +000011259 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11260 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011261
Benjamin Peterson14339b62009-01-31 16:36:08 +000011262 if (sep != NULL && sep != Py_None) {
11263 if (PyUnicode_Check(sep))
11264 return _PyUnicode_XStrip(self, striptype, sep);
11265 else {
11266 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 "%s arg must be None or str",
11268 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011269 return NULL;
11270 }
11271 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011272
Benjamin Peterson14339b62009-01-31 16:36:08 +000011273 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011274}
11275
11276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011279\n\
11280Return a copy of the string S with leading and trailing\n\
11281whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011282If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011283
11284static PyObject *
11285unicode_strip(PyUnicodeObject *self, PyObject *args)
11286{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011287 if (PyTuple_GET_SIZE(args) == 0)
11288 return do_strip(self, BOTHSTRIP); /* Common case */
11289 else
11290 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011291}
11292
11293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011296\n\
11297Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011298If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011299
11300static PyObject *
11301unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011303 if (PyTuple_GET_SIZE(args) == 0)
11304 return do_strip(self, LEFTSTRIP); /* Common case */
11305 else
11306 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011307}
11308
11309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011310PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011312\n\
11313Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011314If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011315
11316static PyObject *
11317unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011319 if (PyTuple_GET_SIZE(args) == 0)
11320 return do_strip(self, RIGHTSTRIP); /* Common case */
11321 else
11322 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011323}
11324
11325
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011327unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
11329 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Georg Brandl222de0f2009-04-12 12:01:50 +000011332 if (len < 1) {
11333 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011334 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Tim Peters7a29bd52001-09-12 03:03:31 +000011337 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 /* no repeat, return original string */
11339 Py_INCREF(str);
11340 return (PyObject*) str;
11341 }
Tim Peters8f422462000-09-09 06:13:41 +000011342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (PyUnicode_READY(str) == -1)
11344 return NULL;
11345
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011346 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011347 PyErr_SetString(PyExc_OverflowError,
11348 "repeated string is too long");
11349 return NULL;
11350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354 if (!u)
11355 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011356 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 if (PyUnicode_GET_LENGTH(str) == 1) {
11359 const int kind = PyUnicode_KIND(str);
11360 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11361 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011362 if (kind == PyUnicode_1BYTE_KIND)
11363 memset(to, (unsigned char)fill_char, len);
11364 else {
11365 for (n = 0; n < len; ++n)
11366 PyUnicode_WRITE(kind, to, n, fill_char);
11367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 }
11369 else {
11370 /* number of characters copied this far */
11371 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11372 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11373 char *to = (char *) PyUnicode_DATA(u);
11374 Py_MEMCPY(to, PyUnicode_DATA(str),
11375 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 n = (done <= nchars-done) ? done : nchars-done;
11378 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011379 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 }
11382
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011383 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384 return (PyObject*) u;
11385}
11386
Alexander Belopolsky40018472011-02-26 01:02:56 +000011387PyObject *
11388PyUnicode_Replace(PyObject *obj,
11389 PyObject *subobj,
11390 PyObject *replobj,
11391 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392{
11393 PyObject *self;
11394 PyObject *str1;
11395 PyObject *str2;
11396 PyObject *result;
11397
11398 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011399 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011402 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 Py_DECREF(self);
11404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405 }
11406 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011407 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 Py_DECREF(self);
11409 Py_DECREF(str1);
11410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413 Py_DECREF(self);
11414 Py_DECREF(str1);
11415 Py_DECREF(str2);
11416 return result;
11417}
11418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011420 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421\n\
11422Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011423old replaced by new. If the optional argument count is\n\
11424given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 PyObject *str1;
11430 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011431 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 PyObject *result;
11433
Martin v. Löwis18e16552006-02-15 17:27:45 +000011434 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 str1 = PyUnicode_FromObject(str1);
11439 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11440 return NULL;
11441 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011442 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 Py_DECREF(str1);
11444 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447 result = replace(self, str1, str2, maxcount);
11448
11449 Py_DECREF(str1);
11450 Py_DECREF(str2);
11451 return result;
11452}
11453
Alexander Belopolsky40018472011-02-26 01:02:56 +000011454static PyObject *
11455unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011457 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 Py_ssize_t isize;
11459 Py_ssize_t osize, squote, dquote, i, o;
11460 Py_UCS4 max, quote;
11461 int ikind, okind;
11462 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011465 return NULL;
11466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 isize = PyUnicode_GET_LENGTH(unicode);
11468 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 /* Compute length of output, quote characters, and
11471 maximum character */
11472 osize = 2; /* quotes */
11473 max = 127;
11474 squote = dquote = 0;
11475 ikind = PyUnicode_KIND(unicode);
11476 for (i = 0; i < isize; i++) {
11477 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11478 switch (ch) {
11479 case '\'': squote++; osize++; break;
11480 case '"': dquote++; osize++; break;
11481 case '\\': case '\t': case '\r': case '\n':
11482 osize += 2; break;
11483 default:
11484 /* Fast-path ASCII */
11485 if (ch < ' ' || ch == 0x7f)
11486 osize += 4; /* \xHH */
11487 else if (ch < 0x7f)
11488 osize++;
11489 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11490 osize++;
11491 max = ch > max ? ch : max;
11492 }
11493 else if (ch < 0x100)
11494 osize += 4; /* \xHH */
11495 else if (ch < 0x10000)
11496 osize += 6; /* \uHHHH */
11497 else
11498 osize += 10; /* \uHHHHHHHH */
11499 }
11500 }
11501
11502 quote = '\'';
11503 if (squote) {
11504 if (dquote)
11505 /* Both squote and dquote present. Use squote,
11506 and escape them */
11507 osize += squote;
11508 else
11509 quote = '"';
11510 }
11511
11512 repr = PyUnicode_New(osize, max);
11513 if (repr == NULL)
11514 return NULL;
11515 okind = PyUnicode_KIND(repr);
11516 odata = PyUnicode_DATA(repr);
11517
11518 PyUnicode_WRITE(okind, odata, 0, quote);
11519 PyUnicode_WRITE(okind, odata, osize-1, quote);
11520
11521 for (i = 0, o = 1; i < isize; i++) {
11522 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011523
11524 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 if ((ch == quote) || (ch == '\\')) {
11526 PyUnicode_WRITE(okind, odata, o++, '\\');
11527 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011528 continue;
11529 }
11530
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011532 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 PyUnicode_WRITE(okind, odata, o++, '\\');
11534 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011535 }
11536 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 PyUnicode_WRITE(okind, odata, o++, '\\');
11538 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011539 }
11540 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 PyUnicode_WRITE(okind, odata, o++, '\\');
11542 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011543 }
11544
11545 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011546 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 PyUnicode_WRITE(okind, odata, o++, '\\');
11548 PyUnicode_WRITE(okind, odata, o++, 'x');
11549 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11550 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011551 }
11552
Georg Brandl559e5d72008-06-11 18:37:52 +000011553 /* Copy ASCII characters as-is */
11554 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011556 }
11557
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011559 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011560 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011561 (categories Z* and C* except ASCII space)
11562 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011564 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 if (ch <= 0xff) {
11566 PyUnicode_WRITE(okind, odata, o++, '\\');
11567 PyUnicode_WRITE(okind, odata, o++, 'x');
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11569 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011570 }
11571 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 else if (ch >= 0x10000) {
11573 PyUnicode_WRITE(okind, odata, o++, '\\');
11574 PyUnicode_WRITE(okind, odata, o++, 'U');
11575 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11576 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11577 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11578 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11579 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11580 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11581 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11582 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011583 }
11584 /* Map 16-bit characters to '\uxxxx' */
11585 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 PyUnicode_WRITE(okind, odata, o++, '\\');
11587 PyUnicode_WRITE(okind, odata, o++, 'u');
11588 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11589 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11590 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11591 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011592 }
11593 }
11594 /* Copy characters as-is */
11595 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011597 }
11598 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011601 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011602 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607\n\
11608Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011609such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610arguments start and end are interpreted as in slice notation.\n\
11611\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
11614static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616{
Jesus Ceaac451502011-04-20 17:09:23 +020011617 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011618 Py_ssize_t start;
11619 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Jesus Ceaac451502011-04-20 17:09:23 +020011622 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11623 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (PyUnicode_READY(self) == -1)
11627 return NULL;
11628 if (PyUnicode_READY(substring) == -1)
11629 return NULL;
11630
11631 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011632 asciilib_rfind_slice, ucs1lib_rfind_slice,
11633 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636
11637 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (result == -2)
11640 return NULL;
11641
Christian Heimes217cfd12007-12-02 14:31:20 +000011642 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643}
11644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
11650static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652{
Jesus Ceaac451502011-04-20 17:09:23 +020011653 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011654 Py_ssize_t start;
11655 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011656 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657
Jesus Ceaac451502011-04-20 17:09:23 +020011658 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11659 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664 if (PyUnicode_READY(substring) == -1)
11665 return NULL;
11666
11667 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011668 asciilib_rfind_slice, ucs1lib_rfind_slice,
11669 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011671 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
11673 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (result == -2)
11676 return NULL;
11677
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 if (result < 0) {
11679 PyErr_SetString(PyExc_ValueError, "substring not found");
11680 return NULL;
11681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682
Christian Heimes217cfd12007-12-02 14:31:20 +000011683 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684}
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011689Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011690done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011693unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011695 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 Py_UCS4 fillchar = ' ';
11697
Victor Stinnere9a29352011-10-01 02:14:59 +020011698 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011700
Victor Stinnere9a29352011-10-01 02:14:59 +020011701 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 return NULL;
11703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 Py_INCREF(self);
11706 return (PyObject*) self;
11707 }
11708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
Alexander Belopolsky40018472011-02-26 01:02:56 +000011712PyObject *
11713PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714{
11715 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011716
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 s = PyUnicode_FromObject(s);
11718 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 if (sep != NULL) {
11721 sep = PyUnicode_FromObject(sep);
11722 if (sep == NULL) {
11723 Py_DECREF(s);
11724 return NULL;
11725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 }
11727
Victor Stinner9310abb2011-10-05 00:59:23 +020011728 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730 Py_DECREF(s);
11731 Py_XDECREF(sep);
11732 return result;
11733}
11734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011735PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737\n\
11738Return a list of the words in S, using sep as the\n\
11739delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011740splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011741whitespace string is a separator and empty strings are\n\
11742removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011745unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746{
11747 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011748 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Martin v. Löwis18e16552006-02-15 17:27:45 +000011750 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 return NULL;
11752
11753 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011756 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761PyObject *
11762PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11763{
11764 PyObject* str_obj;
11765 PyObject* sep_obj;
11766 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 int kind1, kind2, kind;
11768 void *buf1 = NULL, *buf2 = NULL;
11769 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011770
11771 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011772 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011774 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011776 Py_DECREF(str_obj);
11777 return NULL;
11778 }
11779
Victor Stinner14f8f022011-10-05 20:58:25 +020011780 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011782 kind = Py_MAX(kind1, kind2);
11783 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011785 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (!buf1)
11787 goto onError;
11788 buf2 = PyUnicode_DATA(sep_obj);
11789 if (kind2 != kind)
11790 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11791 if (!buf2)
11792 goto onError;
11793 len1 = PyUnicode_GET_LENGTH(str_obj);
11794 len2 = PyUnicode_GET_LENGTH(sep_obj);
11795
Victor Stinner14f8f022011-10-05 20:58:25 +020011796 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011798 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11799 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11800 else
11801 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 break;
11803 case PyUnicode_2BYTE_KIND:
11804 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11805 break;
11806 case PyUnicode_4BYTE_KIND:
11807 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11808 break;
11809 default:
11810 assert(0);
11811 out = 0;
11812 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011813
11814 Py_DECREF(sep_obj);
11815 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (kind1 != kind)
11817 PyMem_Free(buf1);
11818 if (kind2 != kind)
11819 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011820
11821 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 onError:
11823 Py_DECREF(sep_obj);
11824 Py_DECREF(str_obj);
11825 if (kind1 != kind && buf1)
11826 PyMem_Free(buf1);
11827 if (kind2 != kind && buf2)
11828 PyMem_Free(buf2);
11829 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830}
11831
11832
11833PyObject *
11834PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11835{
11836 PyObject* str_obj;
11837 PyObject* sep_obj;
11838 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 int kind1, kind2, kind;
11840 void *buf1 = NULL, *buf2 = NULL;
11841 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011842
11843 str_obj = PyUnicode_FromObject(str_in);
11844 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011846 sep_obj = PyUnicode_FromObject(sep_in);
11847 if (!sep_obj) {
11848 Py_DECREF(str_obj);
11849 return NULL;
11850 }
11851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 kind1 = PyUnicode_KIND(str_in);
11853 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011854 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 buf1 = PyUnicode_DATA(str_in);
11856 if (kind1 != kind)
11857 buf1 = _PyUnicode_AsKind(str_in, kind);
11858 if (!buf1)
11859 goto onError;
11860 buf2 = PyUnicode_DATA(sep_obj);
11861 if (kind2 != kind)
11862 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11863 if (!buf2)
11864 goto onError;
11865 len1 = PyUnicode_GET_LENGTH(str_obj);
11866 len2 = PyUnicode_GET_LENGTH(sep_obj);
11867
11868 switch(PyUnicode_KIND(str_in)) {
11869 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011870 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11871 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11872 else
11873 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 break;
11875 case PyUnicode_2BYTE_KIND:
11876 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11877 break;
11878 case PyUnicode_4BYTE_KIND:
11879 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11880 break;
11881 default:
11882 assert(0);
11883 out = 0;
11884 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011885
11886 Py_DECREF(sep_obj);
11887 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (kind1 != kind)
11889 PyMem_Free(buf1);
11890 if (kind2 != kind)
11891 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011892
11893 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 onError:
11895 Py_DECREF(sep_obj);
11896 Py_DECREF(str_obj);
11897 if (kind1 != kind && buf1)
11898 PyMem_Free(buf1);
11899 if (kind2 != kind && buf2)
11900 PyMem_Free(buf2);
11901 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011902}
11903
11904PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011906\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011907Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011909found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011910
11911static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011912unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011913{
Victor Stinner9310abb2011-10-05 00:59:23 +020011914 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011915}
11916
11917PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011918 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011919\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011920Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011921the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011922separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011923
11924static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011925unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011926{
Victor Stinner9310abb2011-10-05 00:59:23 +020011927 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011928}
11929
Alexander Belopolsky40018472011-02-26 01:02:56 +000011930PyObject *
11931PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011932{
11933 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011934
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011935 s = PyUnicode_FromObject(s);
11936 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011937 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 if (sep != NULL) {
11939 sep = PyUnicode_FromObject(sep);
11940 if (sep == NULL) {
11941 Py_DECREF(s);
11942 return NULL;
11943 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011944 }
11945
Victor Stinner9310abb2011-10-05 00:59:23 +020011946 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011947
11948 Py_DECREF(s);
11949 Py_XDECREF(sep);
11950 return result;
11951}
11952
11953PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955\n\
11956Return a list of the words in S, using sep as the\n\
11957delimiter string, starting at the end of the string and\n\
11958working to the front. If maxsplit is given, at most maxsplit\n\
11959splits are done. If sep is not specified, any whitespace string\n\
11960is a separator.");
11961
11962static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011963unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011964{
11965 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011966 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011967
Martin v. Löwis18e16552006-02-15 17:27:45 +000011968 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011969 return NULL;
11970
11971 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011973 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011974 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011975 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011976 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011977}
11978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011979PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981\n\
11982Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011983Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011984is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
11986static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011987unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011989 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011990 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011992 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11993 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 return NULL;
11995
Guido van Rossum86662912000-04-11 15:38:46 +000011996 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
11999static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012000PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Walter Dörwald346737f2007-05-31 10:44:43 +000012002 if (PyUnicode_CheckExact(self)) {
12003 Py_INCREF(self);
12004 return self;
12005 } else
12006 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012007 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008}
12009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012010PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012\n\
12013Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012014and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
12016static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012017unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 return fixup(self, fixswapcase);
12020}
12021
Georg Brandlceee0772007-11-27 23:48:05 +000012022PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012024\n\
12025Return a translation table usable for str.translate().\n\
12026If there is only one argument, it must be a dictionary mapping Unicode\n\
12027ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012028Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012029If there are two arguments, they must be strings of equal length, and\n\
12030in the resulting dictionary, each character in x will be mapped to the\n\
12031character at the same position in y. If there is a third argument, it\n\
12032must be a string, whose characters will be mapped to None in the result.");
12033
12034static PyObject*
12035unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12036{
12037 PyObject *x, *y = NULL, *z = NULL;
12038 PyObject *new = NULL, *key, *value;
12039 Py_ssize_t i = 0;
12040 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012041
Georg Brandlceee0772007-11-27 23:48:05 +000012042 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12043 return NULL;
12044 new = PyDict_New();
12045 if (!new)
12046 return NULL;
12047 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 int x_kind, y_kind, z_kind;
12049 void *x_data, *y_data, *z_data;
12050
Georg Brandlceee0772007-11-27 23:48:05 +000012051 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012052 if (!PyUnicode_Check(x)) {
12053 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12054 "be a string if there is a second argument");
12055 goto err;
12056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012058 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12059 "arguments must have equal length");
12060 goto err;
12061 }
12062 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 x_kind = PyUnicode_KIND(x);
12064 y_kind = PyUnicode_KIND(y);
12065 x_data = PyUnicode_DATA(x);
12066 y_data = PyUnicode_DATA(y);
12067 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12068 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12069 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012070 if (!key || !value)
12071 goto err;
12072 res = PyDict_SetItem(new, key, value);
12073 Py_DECREF(key);
12074 Py_DECREF(value);
12075 if (res < 0)
12076 goto err;
12077 }
12078 /* create entries for deleting chars in z */
12079 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 z_kind = PyUnicode_KIND(z);
12081 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012082 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012084 if (!key)
12085 goto err;
12086 res = PyDict_SetItem(new, key, Py_None);
12087 Py_DECREF(key);
12088 if (res < 0)
12089 goto err;
12090 }
12091 }
12092 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 int kind;
12094 void *data;
12095
Georg Brandlceee0772007-11-27 23:48:05 +000012096 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012097 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012098 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12099 "to maketrans it must be a dict");
12100 goto err;
12101 }
12102 /* copy entries into the new dict, converting string keys to int keys */
12103 while (PyDict_Next(x, &i, &key, &value)) {
12104 if (PyUnicode_Check(key)) {
12105 /* convert string keys to integer keys */
12106 PyObject *newkey;
12107 if (PyUnicode_GET_SIZE(key) != 1) {
12108 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12109 "table must be of length 1");
12110 goto err;
12111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 kind = PyUnicode_KIND(key);
12113 data = PyUnicode_DATA(key);
12114 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012115 if (!newkey)
12116 goto err;
12117 res = PyDict_SetItem(new, newkey, value);
12118 Py_DECREF(newkey);
12119 if (res < 0)
12120 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012121 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012122 /* just keep integer keys */
12123 if (PyDict_SetItem(new, key, value) < 0)
12124 goto err;
12125 } else {
12126 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12127 "be strings or integers");
12128 goto err;
12129 }
12130 }
12131 }
12132 return new;
12133 err:
12134 Py_DECREF(new);
12135 return NULL;
12136}
12137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012138PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140\n\
12141Return a copy of the string S, where all characters have been mapped\n\
12142through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012143Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012144Unmapped characters are left untouched. Characters mapped to None\n\
12145are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151}
12152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012153PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012156Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012159unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161 return fixup(self, fixupper);
12162}
12163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012164PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012165 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012167Pad a numeric string S with zeros on the left, to fill a field\n\
12168of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
12170static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012171unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012173 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012174 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012175 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 int kind;
12177 void *data;
12178 Py_UCS4 chr;
12179
12180 if (PyUnicode_READY(self) == -1)
12181 return NULL;
12182
Martin v. Löwis18e16552006-02-15 17:27:45 +000012183 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 return NULL;
12185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012187 if (PyUnicode_CheckExact(self)) {
12188 Py_INCREF(self);
12189 return (PyObject*) self;
12190 }
12191 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012192 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 }
12194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
12197 u = pad(self, fill, 0, '0');
12198
Walter Dörwald068325e2002-04-15 13:36:47 +000012199 if (u == NULL)
12200 return NULL;
12201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 kind = PyUnicode_KIND(u);
12203 data = PyUnicode_DATA(u);
12204 chr = PyUnicode_READ(kind, data, fill);
12205
12206 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 PyUnicode_WRITE(kind, data, 0, chr);
12209 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 }
12211
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012212 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213 return (PyObject*) u;
12214}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
#if 0
/* Debug-only helper, currently compiled out: forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012224PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012227Return True if S starts with the specified prefix, False otherwise.\n\
12228With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012229With optional end, stop comparing S at that position.\n\
12230prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231
12232static PyObject *
12233unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012236 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012238 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012239 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012240 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
Jesus Ceaac451502011-04-20 17:09:23 +020012242 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012244 if (PyTuple_Check(subobj)) {
12245 Py_ssize_t i;
12246 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12247 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012249 if (substring == NULL)
12250 return NULL;
12251 result = tailmatch(self, substring, start, end, -1);
12252 Py_DECREF(substring);
12253 if (result) {
12254 Py_RETURN_TRUE;
12255 }
12256 }
12257 /* nothing matched */
12258 Py_RETURN_FALSE;
12259 }
12260 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012261 if (substring == NULL) {
12262 if (PyErr_ExceptionMatches(PyExc_TypeError))
12263 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12264 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012266 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012267 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012269 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270}
12271
12272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012273PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012276Return True if S ends with the specified suffix, False otherwise.\n\
12277With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012278With optional end, stop comparing S at that position.\n\
12279suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281static PyObject *
12282unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012285 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012287 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012288 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012289 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
Jesus Ceaac451502011-04-20 17:09:23 +020012291 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012293 if (PyTuple_Check(subobj)) {
12294 Py_ssize_t i;
12295 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12296 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012298 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012300 result = tailmatch(self, substring, start, end, +1);
12301 Py_DECREF(substring);
12302 if (result) {
12303 Py_RETURN_TRUE;
12304 }
12305 }
12306 Py_RETURN_FALSE;
12307 }
12308 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012309 if (substring == NULL) {
12310 if (PyErr_ExceptionMatches(PyExc_TypeError))
12311 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12312 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012314 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012315 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012317 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318}
12319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012321
12322PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012324\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012325Return a formatted version of S, using substitutions from args and kwargs.\n\
12326The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012327
Eric Smith27bbca62010-11-04 17:06:58 +000012328PyDoc_STRVAR(format_map__doc__,
12329 "S.format_map(mapping) -> str\n\
12330\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012331Return a formatted version of S, using substitutions from mapping.\n\
12332The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012333
Eric Smith4a7d76d2008-05-30 18:10:19 +000012334static PyObject *
12335unicode__format__(PyObject* self, PyObject* args)
12336{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012337 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012338
12339 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12340 return NULL;
12341
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012342 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012344 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012345}
12346
Eric Smith8c663262007-08-25 02:26:07 +000012347PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012349\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012350Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012351
12352static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012353unicode__sizeof__(PyUnicodeObject *v)
12354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 Py_ssize_t size;
12356
12357 /* If it's a compact object, account for base structure +
12358 character data. */
12359 if (PyUnicode_IS_COMPACT_ASCII(v))
12360 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12361 else if (PyUnicode_IS_COMPACT(v))
12362 size = sizeof(PyCompactUnicodeObject) +
12363 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12364 else {
12365 /* If it is a two-block object, account for base object, and
12366 for character block if present. */
12367 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012368 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 size += (PyUnicode_GET_LENGTH(v) + 1) *
12370 PyUnicode_CHARACTER_SIZE(v);
12371 }
12372 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012373 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012374 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012376 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012377 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378
12379 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012380}
12381
12382PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012384
12385static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012386unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012387{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012388 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 if (!copy)
12390 return NULL;
12391 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012392}
12393
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394static PyMethodDef unicode_methods[] = {
12395
12396 /* Order is according to common usage: often used methods should
12397 appear first, since lookup is done sequentially. */
12398
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012399 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012400 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12401 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012402 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012403 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12404 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12405 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12406 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12407 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12408 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12409 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012411 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12412 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12413 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012414 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012415 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12416 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12417 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012418 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012420 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012422 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12423 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12424 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12425 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12426 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12427 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12428 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12429 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12430 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12431 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12432 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12433 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12434 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12435 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012436 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012437 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012438 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012439 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012440 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012441 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012442 {"maketrans", (PyCFunction) unicode_maketrans,
12443 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012444 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012445#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012446 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447#endif
12448
12449#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012450 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012451 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452#endif
12453
Benjamin Peterson14339b62009-01-31 16:36:08 +000012454 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455 {NULL, NULL}
12456};
12457
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012458static PyObject *
12459unicode_mod(PyObject *v, PyObject *w)
12460{
Brian Curtindfc80e32011-08-10 20:28:54 -050012461 if (!PyUnicode_Check(v))
12462 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012464}
12465
12466static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 0, /*nb_add*/
12468 0, /*nb_subtract*/
12469 0, /*nb_multiply*/
12470 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012471};
12472
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 (lenfunc) unicode_length, /* sq_length */
12475 PyUnicode_Concat, /* sq_concat */
12476 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12477 (ssizeargfunc) unicode_getitem, /* sq_item */
12478 0, /* sq_slice */
12479 0, /* sq_ass_item */
12480 0, /* sq_ass_slice */
12481 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482};
12483
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012484static PyObject*
12485unicode_subscript(PyUnicodeObject* self, PyObject* item)
12486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (PyUnicode_READY(self) == -1)
12488 return NULL;
12489
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012490 if (PyIndex_Check(item)) {
12491 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012492 if (i == -1 && PyErr_Occurred())
12493 return NULL;
12494 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012496 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012497 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012498 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012499 PyObject *result;
12500 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012501 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012502 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012506 return NULL;
12507 }
12508
12509 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 return PyUnicode_New(0, 0);
12511 } else if (start == 0 && step == 1 &&
12512 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012513 PyUnicode_CheckExact(self)) {
12514 Py_INCREF(self);
12515 return (PyObject *)self;
12516 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012517 return PyUnicode_Substring((PyObject*)self,
12518 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012519 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012520 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012521 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012522 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012523 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012524 src_data = PyUnicode_DATA(self);
12525 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12526 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012527 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012528 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012529 if (max_char >= kind_limit)
12530 break;
12531 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012532 }
12533 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012534 if (result == NULL)
12535 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012536 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012537 dest_data = PyUnicode_DATA(result);
12538
12539 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012540 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12541 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012542 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012543 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012544 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012545 } else {
12546 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12547 return NULL;
12548 }
12549}
12550
12551static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012552 (lenfunc)unicode_length, /* mp_length */
12553 (binaryfunc)unicode_subscript, /* mp_subscript */
12554 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012555};
12556
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558/* Helpers for PyUnicode_Format() */
12559
12560static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012561getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012563 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 (*p_argidx)++;
12566 if (arglen < 0)
12567 return args;
12568 else
12569 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570 }
12571 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 return NULL;
12574}
12575
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012576/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012578static PyObject *
12579formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012581 char *p;
12582 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012584
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585 x = PyFloat_AsDouble(v);
12586 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012587 return NULL;
12588
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012591
Eric Smith0923d1d2009-04-16 20:16:10 +000012592 p = PyOS_double_to_string(x, type, prec,
12593 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012594 if (p == NULL)
12595 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012597 PyMem_Free(p);
12598 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599}
12600
Tim Peters38fd5b62000-09-21 05:43:11 +000012601static PyObject*
12602formatlong(PyObject *val, int flags, int prec, int type)
12603{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604 char *buf;
12605 int len;
12606 PyObject *str; /* temporary string object. */
12607 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012608
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12610 if (!str)
12611 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012613 Py_DECREF(str);
12614 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012615}
12616
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012619 size_t buflen,
12620 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012622 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012623 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (PyUnicode_GET_LENGTH(v) == 1) {
12625 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 buf[1] = '\0';
12627 return 1;
12628 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 goto onError;
12630 }
12631 else {
12632 /* Integer input truncated to a character */
12633 long x;
12634 x = PyLong_AsLong(v);
12635 if (x == -1 && PyErr_Occurred())
12636 goto onError;
12637
12638 if (x < 0 || x > 0x10ffff) {
12639 PyErr_SetString(PyExc_OverflowError,
12640 "%c arg not in range(0x110000)");
12641 return -1;
12642 }
12643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 buf[1] = '\0';
12646 return 1;
12647 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012648
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012650 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012652 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653}
12654
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so it behaves as a single operand
   wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012659
Alexander Belopolsky40018472011-02-26 01:02:56 +000012660PyObject *
12661PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 void *fmt;
12664 int fmtkind;
12665 PyObject *result;
12666 Py_UCS4 *res, *res0;
12667 Py_UCS4 max;
12668 int kind;
12669 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012673
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 PyErr_BadInternalCall();
12676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12679 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 fmt = PyUnicode_DATA(uformat);
12682 fmtkind = PyUnicode_KIND(uformat);
12683 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12684 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12688 if (res0 == NULL) {
12689 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
12693 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 arglen = PyTuple_Size(args);
12695 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 }
12697 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 arglen = -1;
12699 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012701 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012702 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
12705 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 if (--rescnt < 0) {
12708 rescnt = fmtcnt + 100;
12709 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12711 if (res0 == NULL){
12712 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 }
12715 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012719 }
12720 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 /* Got a format specifier */
12722 int flags = 0;
12723 Py_ssize_t width = -1;
12724 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 Py_UCS4 c = '\0';
12726 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 int isnumok;
12728 PyObject *v = NULL;
12729 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 void *pbuf;
12731 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 Py_ssize_t len, len1;
12734 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 fmtpos++;
12737 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12738 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 Py_ssize_t keylen;
12740 PyObject *key;
12741 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012742
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 if (dict == NULL) {
12744 PyErr_SetString(PyExc_TypeError,
12745 "format requires a mapping");
12746 goto onError;
12747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 /* Skip over balanced parentheses */
12752 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 if (fmtcnt < 0 || pcount > 0) {
12761 PyErr_SetString(PyExc_ValueError,
12762 "incomplete format key");
12763 goto onError;
12764 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012765 key = PyUnicode_Substring((PyObject*)uformat,
12766 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 if (key == NULL)
12768 goto onError;
12769 if (args_owned) {
12770 Py_DECREF(args);
12771 args_owned = 0;
12772 }
12773 args = PyObject_GetItem(dict, key);
12774 Py_DECREF(key);
12775 if (args == NULL) {
12776 goto onError;
12777 }
12778 args_owned = 1;
12779 arglen = -1;
12780 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 case '-': flags |= F_LJUST; continue;
12785 case '+': flags |= F_SIGN; continue;
12786 case ' ': flags |= F_BLANK; continue;
12787 case '#': flags |= F_ALT; continue;
12788 case '0': flags |= F_ZERO; continue;
12789 }
12790 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012791 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 if (c == '*') {
12793 v = getnextarg(args, arglen, &argidx);
12794 if (v == NULL)
12795 goto onError;
12796 if (!PyLong_Check(v)) {
12797 PyErr_SetString(PyExc_TypeError,
12798 "* wants int");
12799 goto onError;
12800 }
12801 width = PyLong_AsLong(v);
12802 if (width == -1 && PyErr_Occurred())
12803 goto onError;
12804 if (width < 0) {
12805 flags |= F_LJUST;
12806 width = -width;
12807 }
12808 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 }
12811 else if (c >= '0' && c <= '9') {
12812 width = c - '0';
12813 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 if (c < '0' || c > '9')
12816 break;
12817 if ((width*10) / 10 != width) {
12818 PyErr_SetString(PyExc_ValueError,
12819 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 }
12822 width = width*10 + (c - '0');
12823 }
12824 }
12825 if (c == '.') {
12826 prec = 0;
12827 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 if (c == '*') {
12830 v = getnextarg(args, arglen, &argidx);
12831 if (v == NULL)
12832 goto onError;
12833 if (!PyLong_Check(v)) {
12834 PyErr_SetString(PyExc_TypeError,
12835 "* wants int");
12836 goto onError;
12837 }
12838 prec = PyLong_AsLong(v);
12839 if (prec == -1 && PyErr_Occurred())
12840 goto onError;
12841 if (prec < 0)
12842 prec = 0;
12843 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 }
12846 else if (c >= '0' && c <= '9') {
12847 prec = c - '0';
12848 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 if (c < '0' || c > '9')
12851 break;
12852 if ((prec*10) / 10 != prec) {
12853 PyErr_SetString(PyExc_ValueError,
12854 "prec too big");
12855 goto onError;
12856 }
12857 prec = prec*10 + (c - '0');
12858 }
12859 }
12860 } /* prec */
12861 if (fmtcnt >= 0) {
12862 if (c == 'h' || c == 'l' || c == 'L') {
12863 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 }
12866 }
12867 if (fmtcnt < 0) {
12868 PyErr_SetString(PyExc_ValueError,
12869 "incomplete format");
12870 goto onError;
12871 }
12872 if (c != '%') {
12873 v = getnextarg(args, arglen, &argidx);
12874 if (v == NULL)
12875 goto onError;
12876 }
12877 sign = 0;
12878 fill = ' ';
12879 switch (c) {
12880
12881 case '%':
12882 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012884 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 len = 1;
12887 break;
12888
12889 case 's':
12890 case 'r':
12891 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012892 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012893 temp = v;
12894 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012895 }
12896 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 if (c == 's')
12898 temp = PyObject_Str(v);
12899 else if (c == 'r')
12900 temp = PyObject_Repr(v);
12901 else
12902 temp = PyObject_ASCII(v);
12903 if (temp == NULL)
12904 goto onError;
12905 if (PyUnicode_Check(temp))
12906 /* nothing to do */;
12907 else {
12908 Py_DECREF(temp);
12909 PyErr_SetString(PyExc_TypeError,
12910 "%s argument has non-string str()");
12911 goto onError;
12912 }
12913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 if (PyUnicode_READY(temp) == -1) {
12915 Py_CLEAR(temp);
12916 goto onError;
12917 }
12918 pbuf = PyUnicode_DATA(temp);
12919 kind = PyUnicode_KIND(temp);
12920 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012921 if (prec >= 0 && len > prec)
12922 len = prec;
12923 break;
12924
12925 case 'i':
12926 case 'd':
12927 case 'u':
12928 case 'o':
12929 case 'x':
12930 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 isnumok = 0;
12932 if (PyNumber_Check(v)) {
12933 PyObject *iobj=NULL;
12934
12935 if (PyLong_Check(v)) {
12936 iobj = v;
12937 Py_INCREF(iobj);
12938 }
12939 else {
12940 iobj = PyNumber_Long(v);
12941 }
12942 if (iobj!=NULL) {
12943 if (PyLong_Check(iobj)) {
12944 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012945 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 Py_DECREF(iobj);
12947 if (!temp)
12948 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 if (PyUnicode_READY(temp) == -1) {
12950 Py_CLEAR(temp);
12951 goto onError;
12952 }
12953 pbuf = PyUnicode_DATA(temp);
12954 kind = PyUnicode_KIND(temp);
12955 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 sign = 1;
12957 }
12958 else {
12959 Py_DECREF(iobj);
12960 }
12961 }
12962 }
12963 if (!isnumok) {
12964 PyErr_Format(PyExc_TypeError,
12965 "%%%c format: a number is required, "
12966 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12967 goto onError;
12968 }
12969 if (flags & F_ZERO)
12970 fill = '0';
12971 break;
12972
12973 case 'e':
12974 case 'E':
12975 case 'f':
12976 case 'F':
12977 case 'g':
12978 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012979 temp = formatfloat(v, flags, prec, c);
12980 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (PyUnicode_READY(temp) == -1) {
12983 Py_CLEAR(temp);
12984 goto onError;
12985 }
12986 pbuf = PyUnicode_DATA(temp);
12987 kind = PyUnicode_KIND(temp);
12988 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 sign = 1;
12990 if (flags & F_ZERO)
12991 fill = '0';
12992 break;
12993
12994 case 'c':
12995 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012997 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 if (len < 0)
12999 goto onError;
13000 break;
13001
13002 default:
13003 PyErr_Format(PyExc_ValueError,
13004 "unsupported format character '%c' (0x%x) "
13005 "at index %zd",
13006 (31<=c && c<=126) ? (char)c : '?',
13007 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 goto onError;
13010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 /* pbuf is initialized here. */
13012 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13015 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13016 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013017 len--;
13018 }
13019 else if (flags & F_SIGN)
13020 sign = '+';
13021 else if (flags & F_BLANK)
13022 sign = ' ';
13023 else
13024 sign = 0;
13025 }
13026 if (width < len)
13027 width = len;
13028 if (rescnt - (sign != 0) < width) {
13029 reslen -= rescnt;
13030 rescnt = width + fmtcnt + 100;
13031 reslen += rescnt;
13032 if (reslen < 0) {
13033 Py_XDECREF(temp);
13034 PyErr_NoMemory();
13035 goto onError;
13036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13038 if (res0 == 0) {
13039 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 Py_XDECREF(temp);
13041 goto onError;
13042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 }
13045 if (sign) {
13046 if (fill != ' ')
13047 *res++ = sign;
13048 rescnt--;
13049 if (width > len)
13050 width--;
13051 }
13052 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13054 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13057 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 }
13059 rescnt -= 2;
13060 width -= 2;
13061 if (width < 0)
13062 width = 0;
13063 len -= 2;
13064 }
13065 if (width > len && !(flags & F_LJUST)) {
13066 do {
13067 --rescnt;
13068 *res++ = fill;
13069 } while (--width > len);
13070 }
13071 if (fill == ' ') {
13072 if (sign)
13073 *res++ = sign;
13074 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013075 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13076 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13077 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13078 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 }
13080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 /* Copy all characters, preserving len */
13082 len1 = len;
13083 while (len1--) {
13084 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13085 rescnt--;
13086 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 while (--width >= len) {
13088 --rescnt;
13089 *res++ = ' ';
13090 }
13091 if (dict && (argidx < arglen) && c != '%') {
13092 PyErr_SetString(PyExc_TypeError,
13093 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013094 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 goto onError;
13096 }
13097 Py_XDECREF(temp);
13098 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099 } /* until end */
13100 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 PyErr_SetString(PyExc_TypeError,
13102 "not all arguments converted during string formatting");
13103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104 }
13105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106
13107 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13108 if (*res > max)
13109 max = *res;
13110 result = PyUnicode_New(reslen - rescnt, max);
13111 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 kind = PyUnicode_KIND(result);
13114 for (res = res0; res < res0+reslen-rescnt; res++)
13115 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13116 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 }
13120 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013121 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 return (PyObject *)result;
13123
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126 Py_DECREF(uformat);
13127 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 }
13130 return NULL;
13131}
13132
Jeremy Hylton938ace62002-07-17 16:30:39 +000013133static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013134unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13135
Tim Peters6d6c1a32001-08-02 04:15:00 +000013136static PyObject *
13137unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13138{
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 static char *kwlist[] = {"object", "encoding", "errors", 0};
13141 char *encoding = NULL;
13142 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013143
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 if (type != &PyUnicode_Type)
13145 return unicode_subtype_new(type, args, kwds);
13146 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 return NULL;
13149 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 if (encoding == NULL && errors == NULL)
13152 return PyObject_Str(x);
13153 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013155}
13156
Guido van Rossume023fe02001-08-30 03:12:59 +000013157static PyObject *
13158unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13159{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013160 PyUnicodeObject *unicode, *self;
13161 Py_ssize_t length, char_size;
13162 int share_wstr, share_utf8;
13163 unsigned int kind;
13164 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013165
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013167
13168 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13169 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013171 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013172 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013173 return NULL;
13174
13175 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13176 if (self == NULL) {
13177 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013178 return NULL;
13179 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013180 kind = PyUnicode_KIND(unicode);
13181 length = PyUnicode_GET_LENGTH(unicode);
13182
13183 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013184#ifdef Py_DEBUG
13185 _PyUnicode_HASH(self) = -1;
13186#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013187 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013188#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013189 _PyUnicode_STATE(self).interned = 0;
13190 _PyUnicode_STATE(self).kind = kind;
13191 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013192 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013193 _PyUnicode_STATE(self).ready = 1;
13194 _PyUnicode_WSTR(self) = NULL;
13195 _PyUnicode_UTF8_LENGTH(self) = 0;
13196 _PyUnicode_UTF8(self) = NULL;
13197 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013198 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013199
13200 share_utf8 = 0;
13201 share_wstr = 0;
13202 if (kind == PyUnicode_1BYTE_KIND) {
13203 char_size = 1;
13204 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13205 share_utf8 = 1;
13206 }
13207 else if (kind == PyUnicode_2BYTE_KIND) {
13208 char_size = 2;
13209 if (sizeof(wchar_t) == 2)
13210 share_wstr = 1;
13211 }
13212 else {
13213 assert(kind == PyUnicode_4BYTE_KIND);
13214 char_size = 4;
13215 if (sizeof(wchar_t) == 4)
13216 share_wstr = 1;
13217 }
13218
13219 /* Ensure we won't overflow the length. */
13220 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13221 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013223 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013224 data = PyObject_MALLOC((length + 1) * char_size);
13225 if (data == NULL) {
13226 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 goto onError;
13228 }
13229
Victor Stinnerc3c74152011-10-02 20:39:55 +020013230 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013231 if (share_utf8) {
13232 _PyUnicode_UTF8_LENGTH(self) = length;
13233 _PyUnicode_UTF8(self) = data;
13234 }
13235 if (share_wstr) {
13236 _PyUnicode_WSTR_LENGTH(self) = length;
13237 _PyUnicode_WSTR(self) = (wchar_t *)data;
13238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013240 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13241 PyUnicode_KIND_SIZE(kind, length + 1));
13242 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013243 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013244#ifdef Py_DEBUG
13245 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13246#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013247 return (PyObject *)self;
13248
13249onError:
13250 Py_DECREF(unicode);
13251 Py_DECREF(self);
13252 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013253}
13254
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013261
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013262static PyObject *unicode_iter(PyObject *seq);
13263
/* Type object for str.  Slots set to 0 are left unset here; the type is
   finalized by PyType_Ready() in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13307
13308/* Initialize the Unicode implementation */
13309
Thomas Wouters78890102000-07-22 19:25:51 +000013310void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013312 int i;
13313
Thomas Wouters477c8d52006-05-27 19:21:47 +000013314 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013316 0x000A, /* LINE FEED */
13317 0x000D, /* CARRIAGE RETURN */
13318 0x001C, /* FILE SEPARATOR */
13319 0x001D, /* GROUP SEPARATOR */
13320 0x001E, /* RECORD SEPARATOR */
13321 0x0085, /* NEXT LINE */
13322 0x2028, /* LINE SEPARATOR */
13323 0x2029, /* PARAGRAPH SEPARATOR */
13324 };
13325
Fred Drakee4315f52000-05-09 19:53:39 +000013326 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013327 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013328 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013332 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013334 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013336
13337 /* initialize the linebreak bloom filter */
13338 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013340 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013341
13342 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
13344
13345/* Finalize the Unicode implementation */
13346
/* Release cached free-list entries and report how many were freed.
   This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed_count = 0;

    return freed_count;
}
13352
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353void
Thomas Wouters78890102000-07-22 19:25:51 +000013354_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013356 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013358 Py_XDECREF(unicode_empty);
13359 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013360
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013361 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 if (unicode_latin1[i]) {
13363 Py_DECREF(unicode_latin1[i]);
13364 unicode_latin1[i] = NULL;
13365 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013366 }
Christian Heimesa156e092008-02-16 07:38:31 +000013367 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013369
/* Intern the string *p in place.

   If an equal string is already in the `interned` dictionary, *p is
   replaced by it (the old reference is released and a new reference to the
   existing object is stored).  Otherwise *p itself is inserted and marked
   SSTATE_INTERNED_MORTAL.

   Nothing is done for subclasses of str or for strings that are already
   interned.  Failures (dictionary creation or insertion) are swallowed:
   the exception is cleared and *p is left not interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    /* Release builds silently ignore NULL and non-string arguments. */
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (_PyUnicode_READY_REPLACE(p)) {
        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
        return;
    }
    /* _PyUnicode_READY_REPLACE is given p and may have stored a different
       object in *p, so reload it. */
    s = (PyUnicodeObject *)(*p);
    /* The dictionary of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is set around the insertion and cleared again on
       both the success and the failure path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13426
13427void
13428PyUnicode_InternImmortal(PyObject **p)
13429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13431
Benjamin Peterson14339b62009-01-31 16:36:08 +000013432 PyUnicode_InternInPlace(p);
13433 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013435 Py_INCREF(*p);
13436 }
Walter Dörwald16807132007-05-25 13:52:07 +000013437}
13438
13439PyObject *
13440PyUnicode_InternFromString(const char *cp)
13441{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013442 PyObject *s = PyUnicode_FromString(cp);
13443 if (s == NULL)
13444 return NULL;
13445 PyUnicode_InternInPlace(&s);
13446 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013447}
13448
Alexander Belopolsky40018472011-02-26 01:02:56 +000013449void
13450_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013452 PyObject *keys;
13453 PyUnicodeObject *s;
13454 Py_ssize_t i, n;
13455 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013456
Benjamin Peterson14339b62009-01-31 16:36:08 +000013457 if (interned == NULL || !PyDict_Check(interned))
13458 return;
13459 keys = PyDict_Keys(interned);
13460 if (keys == NULL || !PyList_Check(keys)) {
13461 PyErr_Clear();
13462 return;
13463 }
Walter Dörwald16807132007-05-25 13:52:07 +000013464
Benjamin Peterson14339b62009-01-31 16:36:08 +000013465 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13466 detector, interned unicode strings are not forcibly deallocated;
13467 rather, we give them their stolen references back, and then clear
13468 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013469
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 n = PyList_GET_SIZE(keys);
13471 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013473 for (i = 0; i < n; i++) {
13474 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013475 if (PyUnicode_READY(s) == -1) {
13476 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013479 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013480 case SSTATE_NOT_INTERNED:
13481 /* XXX Shouldn't happen */
13482 break;
13483 case SSTATE_INTERNED_IMMORTAL:
13484 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013486 break;
13487 case SSTATE_INTERNED_MORTAL:
13488 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013490 break;
13491 default:
13492 Py_FatalError("Inconsistent interned string state.");
13493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013495 }
13496 fprintf(stderr, "total size of all interned strings: "
13497 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13498 "mortal/immortal\n", mortal_size, immortal_size);
13499 Py_DECREF(keys);
13500 PyDict_Clear(interned);
13501 Py_DECREF(interned);
13502 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013503}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013504
13505
13506/********************* Unicode Iterator **************************/
13507
/* State of an iterator over a unicode object (instances of
   PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    /* Index of the next character unicodeiter_next() will read from it_seq */
    Py_ssize_t it_index;
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13513
13514static void
13515unicodeiter_dealloc(unicodeiterobject *it)
13516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013517 _PyObject_GC_UNTRACK(it);
13518 Py_XDECREF(it->it_seq);
13519 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013520}
13521
13522static int
13523unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013525 Py_VISIT(it->it_seq);
13526 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013527}
13528
13529static PyObject *
13530unicodeiter_next(unicodeiterobject *it)
13531{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013532 PyUnicodeObject *seq;
13533 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013534
Benjamin Peterson14339b62009-01-31 16:36:08 +000013535 assert(it != NULL);
13536 seq = it->it_seq;
13537 if (seq == NULL)
13538 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013539 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013541 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13542 int kind = PyUnicode_KIND(seq);
13543 void *data = PyUnicode_DATA(seq);
13544 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13545 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013546 if (item != NULL)
13547 ++it->it_index;
13548 return item;
13549 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013550
Benjamin Peterson14339b62009-01-31 16:36:08 +000013551 Py_DECREF(seq);
13552 it->it_seq = NULL;
13553 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013554}
13555
13556static PyObject *
13557unicodeiter_len(unicodeiterobject *it)
13558{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013559 Py_ssize_t len = 0;
13560 if (it->it_seq)
13561 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13562 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013563}
13564
/* Docstring attached to the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator; it only exposes __length_hint__,
   implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL, NULL} /* sentinel */
};
13572
/* Type object of the unicode iterator ("str_iterator").  Instances are
   unicodeiterobject structs; the type takes part in garbage collection
   (Py_TPFLAGS_HAVE_GC with unicodeiter_traverse) and is its own iterator
   (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator", /* tp_name */
    sizeof(unicodeiterobject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_reserved */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    PyObject_GenericGetAttr, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0, /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    PyObject_SelfIter, /* tp_iter */
    (iternextfunc)unicodeiter_next, /* tp_iternext */
    unicodeiter_methods, /* tp_methods */
    0, /* tp_members */
};
13605
13606static PyObject *
13607unicode_iter(PyObject *seq)
13608{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013609 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013610
Benjamin Peterson14339b62009-01-31 16:36:08 +000013611 if (!PyUnicode_Check(seq)) {
13612 PyErr_BadInternalCall();
13613 return NULL;
13614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 if (PyUnicode_READY(seq) == -1)
13616 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013617 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13618 if (it == NULL)
13619 return NULL;
13620 it->it_index = 0;
13621 Py_INCREF(seq);
13622 it->it_seq = (PyUnicodeObject *)seq;
13623 _PyObject_GC_TRACK(it);
13624 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013625}
13626
/* "uniops.h" is included twice with different UNIOP/UNIOP_t definitions:
   first with the Py_UNICODE_ name prefix and the Py_UNICODE type, then with
   the Py_UCS4_ prefix and the Py_UCS4 type.
   NOTE(review): presumably the header is a template that defines one set of
   helper functions per inclusion -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013637
Victor Stinner71133ff2010-09-01 23:43:53 +000013638Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013639PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013640{
13641 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13642 Py_UNICODE *copy;
13643 Py_ssize_t size;
13644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 if (!PyUnicode_Check(unicode)) {
13646 PyErr_BadArgument();
13647 return NULL;
13648 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013649 /* Ensure we won't overflow the size. */
13650 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13651 PyErr_NoMemory();
13652 return NULL;
13653 }
13654 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13655 size *= sizeof(Py_UNICODE);
13656 copy = PyMem_Malloc(size);
13657 if (copy == NULL) {
13658 PyErr_NoMemory();
13659 return NULL;
13660 }
13661 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13662 return copy;
13663}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013664
Georg Brandl66c221e2010-10-14 07:04:07 +000013665/* A _string module, to export formatter_parser and formatter_field_name_split
13666 to the string.Formatter class implemented in Python. */
13667
/* Functions exported by the _string module.  Both entries are METH_O, i.e.
   each takes exactly one argument object. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL} /* sentinel */
};
13675
/* Module definition of "_string", passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string", /* m_name */
    PyDoc_STR("string helper module"), /* m_doc */
    0, /* m_size */
    _string_methods, /* m_methods */
    /* the remaining optional slots are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
13687
13688PyMODINIT_FUNC
13689PyInit__string(void)
13690{
13691 return PyModule_Create(&_string_module);
13692}
13693
13694
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013695#ifdef __cplusplus
13696}
13697#endif