blob: 4e4e53cfd483082f43d2f47ec089c26a5f293ab5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102#define _PyUnicode_UTF8(op) \
103 (((PyCompactUnicodeObject*)(op))->utf8)
104#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200105 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106 assert(PyUnicode_IS_READY(op)), \
107 PyUnicode_IS_COMPACT_ASCII(op) ? \
108 ((char*)((PyASCIIObject*)(op) + 1)) : \
109 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200110#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200111 (((PyCompactUnicodeObject*)(op))->utf8_length)
112#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200113 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 assert(PyUnicode_IS_READY(op)), \
115 PyUnicode_IS_COMPACT_ASCII(op) ? \
116 ((PyASCIIObject*)(op))->length : \
117 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200118#define _PyUnicode_WSTR(op) \
119 (((PyASCIIObject*)(op))->wstr)
120#define _PyUnicode_WSTR_LENGTH(op) \
121 (((PyCompactUnicodeObject*)(op))->wstr_length)
122#define _PyUnicode_LENGTH(op) \
123 (((PyASCIIObject *)(op))->length)
124#define _PyUnicode_STATE(op) \
125 (((PyASCIIObject *)(op))->state)
126#define _PyUnicode_HASH(op) \
127 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_KIND(op) \
129 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_GET_LENGTH(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200134#define _PyUnicode_DATA_ANY(op) \
135 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
Victor Stinner910337b2011-10-03 03:20:16 +0200137#undef PyUnicode_READY
138#define PyUnicode_READY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200141 0 : \
142 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Make the string referenced through p_obj ready.

   p_obj is a pointer to a string reference.  Evaluates to 0 when the string
   is already ready; otherwise evaluates to the result of
   _PyUnicode_ReadyReplace(), which receives p_obj itself.

   The argument is parenthesized before being dereferenced so that an
   expression such as "items + 1" dereferences the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj) \
    (assert(_PyUnicode_CHECK(*(p_obj))), \
     (PyUnicode_IS_READY(*(p_obj)) ? \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
Victor Stinnerc379ead2011-10-03 12:52:27 +0200149#define _PyUnicode_SHARE_UTF8(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of op is the same pointer as its character data
   (i.e. the wchar_t representation is not stored in a separate buffer).
   Uses only the macro parameter, so it works whatever the caller's variable
   is named. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
Victor Stinner829c0ad2011-10-03 01:08:02 +0200157/* true if the Unicode object has an allocated UTF-8 memory block
158 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200159#define _PyUnicode_HAS_UTF8_MEMORY(op) \
160 (assert(_PyUnicode_CHECK(op)), \
161 (!PyUnicode_IS_COMPACT_ASCII(op) \
162 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
164
Victor Stinner03490912011-10-03 23:45:12 +0200165/* true if the Unicode object has an allocated wstr memory block
166 (not shared with other data) */
167#define _PyUnicode_HAS_WSTR_MEMORY(op) \
168 (assert(_PyUnicode_CHECK(op)), \
169 (_PyUnicode_WSTR(op) && \
170 (!PyUnicode_IS_READY(op) || \
171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Copy a run of characters while converting each one to another type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (half-open range, both of type "from_type *");
   to points to the destination buffer and is treated as "to_type *".
   Each source element is converted with a plain cast, so converting to a
   narrower type truncates.  Note that end is re-evaluated on every
   iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                              \
        const from_type *iter_ = (begin);                             \
        to_type *to_ = (to_type *)(to);                               \
        while (iter_ < (end)) {                                       \
            *to_ = (to_type)*iter_;                                   \
            ++iter_;                                                  \
            ++to_;                                                    \
        }                                                             \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200188/* The Unicode string has been modified: reset the hash */
189#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   that code point is whitespace and 0 otherwise.  The whitespace entries
   are:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */ 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200242static void copy_characters(
243 PyObject *to, Py_ssize_t to_start,
244 PyObject *from, Py_ssize_t from_start,
245 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200246#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200247static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200248#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static PyObject *
251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
253 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
259 const Py_UNICODE *unicode, Py_ssize_t size,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   that code point is a line break and 0 otherwise.  The line break entries
   are:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR

   The table is only ever read, so it is const-qualified like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F */ 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F */ 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x27 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build checker for the internal layout of a Unicode object.

   op must be a Unicode object.  Every invariant is verified with assert(),
   so a violation aborts; when all checks pass the function returns 1, which
   lets callers write assert(_PyUnicode_CheckConsistency(op, 0)).

   The state flags (kind, compact, ascii, ready) are checked against the
   wstr, utf8 and data pointers and their cached lengths.  When
   check_content is non-zero the characters are also scanned to verify that
   the narrowest possible kind is used, and the hash of a non-singleton
   string must still be unset (-1). */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;
    /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *data;

        if (base->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (cu->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that is not ready yet: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->wstr != NULL);
            assert(data == NULL);
            assert(cu->utf8 == NULL);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* legacy string that has been made ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(data != NULL);
            if (base->state.ascii) {
                assert (cu->utf8 == data);
                assert (cu->utf8_length == base->length);
            }
            else
                assert (cu->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(base->wstr == data);
                assert(cu->wstr_length == base->length);
            }
            else
                assert(base->wstr != data);
        }

        /* a missing cache must have a zero cached length */
        if (cu->utf8 == NULL)
            assert(cu->utf8_length == 0);
        if (base->wstr == NULL)
            assert(cu->wstr_length == 0);
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            Py_ssize_t pos = 0;
            Py_UCS4 maxchar = 0;
            void *data = PyUnicode_DATA(base);

            while (pos < base->length) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
                if (ch > maxchar)
                    maxchar = ch;
                pos++;
            }
            switch (kind) {
            case PyUnicode_1BYTE_KIND:
                if (base->state.ascii == 0)
                    assert(maxchar >= 128);
                else
                    assert(maxchar < 128);
                break;
            case PyUnicode_2BYTE_KIND:
                assert(maxchar >= 0x100);
                break;
            default:
                assert(maxchar >= 0x10000);
                break;
            }
        }
        if (!unicode_is_singleton((PyObject*)base))
            assert(base->hash == -1);
    }
    return 1;
}
#endif
410
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411/* --- Bloom Filters ----------------------------------------------------- */
412
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
416
417/* the linebreak mask is set up by Unicode_Init below */
418
Antoine Pitrouf068f942010-01-13 14:19:12 +0000419#if LONG_BIT >= 128
420#define BLOOM_WIDTH 128
421#elif LONG_BIT >= 64
422#define BLOOM_WIDTH 64
423#elif LONG_BIT >= 32
424#define BLOOM_WIDTH 32
425#else
426#error "LONG_BIT is smaller than 32"
427#endif
428
Thomas Wouters477c8d52006-05-27 19:21:47 +0000429#define BLOOM_MASK unsigned long
430
431static BLOOM_MASK bloom_linebreak;
432
/* BLOOM_ADD sets, in the lvalue mask, the bit selected by the low bits of
   ch (ch modulo BLOOM_WIDTH).  BLOOM tests that same bit and is non-zero
   when it is set.  Both arguments are fully parenthesized so that compound
   expressions such as "a | b" can be passed as mask. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000435
Benjamin Peterson29060642009-01-31 22:14:21 +0000436#define BLOOM_LINEBREAK(ch) \
437 ((ch) < 128U ? ascii_linebreak[(ch)] : \
438 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439
Alexander Belopolsky40018472011-02-26 01:02:56 +0000440Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442{
443 /* calculate simple bloom-style bitmask for a given unicode string */
444
Antoine Pitrouf068f942010-01-13 14:19:12 +0000445 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446 Py_ssize_t i;
447
448 mask = 0;
449 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000451
452 return mask;
453}
454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200455#define BLOOM_MEMBER(mask, chr, str) \
456 (BLOOM(mask, chr) \
457 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459/* --- Unicode Object ----------------------------------------------------- */
460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200462fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
465 Py_ssize_t size, Py_UCS4 ch,
466 int direction)
467{
468 /* like wcschr, but doesn't stop at NULL characters */
469 Py_ssize_t i;
470 if (direction == 1) {
471 for(i = 0; i < size; i++)
472 if (PyUnicode_READ(kind, s, i) == ch)
473 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
474 }
475 else {
476 for(i = size-1; i >= 0; i--)
477 if (PyUnicode_READ(kind, s, i) == ch)
478 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
479 }
480 return NULL;
481}
482
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483static PyObject*
484resize_compact(PyObject *unicode, Py_ssize_t length)
485{
486 Py_ssize_t char_size;
487 Py_ssize_t struct_size;
488 Py_ssize_t new_size;
489 int share_wstr;
490
491 assert(PyUnicode_IS_READY(unicode));
492 char_size = PyUnicode_CHARACTER_SIZE(unicode);
493 if (PyUnicode_IS_COMPACT_ASCII(unicode))
494 struct_size = sizeof(PyASCIIObject);
495 else
496 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200497 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498
499 _Py_DEC_REFTOTAL;
500 _Py_ForgetReference(unicode);
501
502 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
503 PyErr_NoMemory();
504 return NULL;
505 }
506 new_size = (struct_size + (length + 1) * char_size);
507
508 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
509 if (unicode == NULL) {
510 PyObject_Del(unicode);
511 PyErr_NoMemory();
512 return NULL;
513 }
514 _Py_NewReference(unicode);
515 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200517 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
522 length, 0);
523 return unicode;
524}
525
Alexander Belopolsky40018472011-02-26 01:02:56 +0000526static int
Victor Stinner95663112011-10-04 01:03:50 +0200527resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528{
Victor Stinner95663112011-10-04 01:03:50 +0200529 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200531 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000532
Victor Stinner95663112011-10-04 01:03:50 +0200533 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534
535 if (PyUnicode_IS_READY(unicode)) {
536 Py_ssize_t char_size;
537 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200538 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200539 void *data;
540
541 data = _PyUnicode_DATA_ANY(unicode);
542 assert(data != NULL);
543 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200544 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
545 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200546 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
547 {
548 PyObject_DEL(_PyUnicode_UTF8(unicode));
549 _PyUnicode_UTF8(unicode) = NULL;
550 _PyUnicode_UTF8_LENGTH(unicode) = 0;
551 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552
553 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
554 PyErr_NoMemory();
555 return -1;
556 }
557 new_size = (length + 1) * char_size;
558
559 data = (PyObject *)PyObject_REALLOC(data, new_size);
560 if (data == NULL) {
561 PyErr_NoMemory();
562 return -1;
563 }
564 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200565 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200566 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200567 _PyUnicode_WSTR_LENGTH(unicode) = length;
568 }
569 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200570 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200571 _PyUnicode_UTF8_LENGTH(unicode) = length;
572 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 _PyUnicode_LENGTH(unicode) = length;
574 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200575 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579 }
Victor Stinner95663112011-10-04 01:03:50 +0200580 assert(_PyUnicode_WSTR(unicode) != NULL);
581
582 /* check for integer overflow */
583 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
584 PyErr_NoMemory();
585 return -1;
586 }
587 wstr = _PyUnicode_WSTR(unicode);
588 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
589 if (!wstr) {
590 PyErr_NoMemory();
591 return -1;
592 }
593 _PyUnicode_WSTR(unicode) = wstr;
594 _PyUnicode_WSTR(unicode)[length] = 0;
595 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 return 0;
598}
599
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600static PyObject*
601resize_copy(PyObject *unicode, Py_ssize_t length)
602{
603 Py_ssize_t copy_length;
604 if (PyUnicode_IS_COMPACT(unicode)) {
605 PyObject *copy;
606 assert(PyUnicode_IS_READY(unicode));
607
608 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
609 if (copy == NULL)
610 return NULL;
611
612 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200613 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200615 }
616 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 assert(_PyUnicode_WSTR(unicode) != NULL);
619 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 if (w == NULL)
622 return NULL;
623 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
624 copy_length = Py_MIN(copy_length, length);
625 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
626 copy_length);
627 return (PyObject*)w;
628 }
629}
630
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000636 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638*/
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640#ifdef Py_DEBUG
641int unicode_old_new_calls = 0;
642#endif
643
Alexander Belopolsky40018472011-02-26 01:02:56 +0000644static PyUnicodeObject *
645_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646{
647 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 if (length == 0 && unicode_empty != NULL) {
652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200653 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000656 /* Ensure we won't overflow the size. */
657 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
658 return (PyUnicodeObject *)PyErr_NoMemory();
659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660 if (length < 0) {
661 PyErr_SetString(PyExc_SystemError,
662 "Negative size passed to _PyUnicode_New");
663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666#ifdef Py_DEBUG
667 ++unicode_old_new_calls;
668#endif
669
670 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
671 if (unicode == NULL)
672 return NULL;
673 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
674 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
675 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000676 PyErr_NoMemory();
677 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679
Jeremy Hyltond8082792003-09-16 19:41:39 +0000680 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000681 * the caller fails before initializing str -- unicode_resize()
682 * reads str[0], and the Keep-Alive optimization can keep memory
683 * allocated for str alive across a call to unicode_dealloc(unicode).
684 * We don't want unicode_resize to read uninitialized memory in
685 * that case.
686 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 _PyUnicode_WSTR(unicode)[0] = 0;
688 _PyUnicode_WSTR(unicode)[length] = 0;
689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 _PyUnicode_HASH(unicode) = -1;
691 _PyUnicode_STATE(unicode).interned = 0;
692 _PyUnicode_STATE(unicode).kind = 0;
693 _PyUnicode_STATE(unicode).compact = 0;
694 _PyUnicode_STATE(unicode).ready = 0;
695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200696 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200698 _PyUnicode_UTF8(unicode) = NULL;
699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000701
Benjamin Peterson29060642009-01-31 22:14:21 +0000702 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000703 /* XXX UNREF/NEWREF interface should be more symmetrical */
704 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000706 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708}
709
Victor Stinnerf42dc442011-10-02 23:33:16 +0200710static const char*
711unicode_kind_name(PyObject *unicode)
712{
Victor Stinner42dfd712011-10-03 14:41:45 +0200713 /* don't check consistency: unicode_kind_name() is called from
714 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (!PyUnicode_IS_COMPACT(unicode))
716 {
717 if (!PyUnicode_IS_READY(unicode))
718 return "wstr";
719 switch(PyUnicode_KIND(unicode))
720 {
721 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 return "legacy ascii";
724 else
725 return "legacy latin1";
726 case PyUnicode_2BYTE_KIND:
727 return "legacy UCS2";
728 case PyUnicode_4BYTE_KIND:
729 return "legacy UCS4";
730 default:
731 return "<legacy invalid kind>";
732 }
733 }
734 assert(PyUnicode_IS_READY(unicode));
735 switch(PyUnicode_KIND(unicode))
736 {
737 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 return "ascii";
740 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200745 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200746 default:
747 return "<invalid compact kind>";
748 }
749}
750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751#ifdef Py_DEBUG
752int unicode_new_new_calls = 0;
753
754/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
758
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
762void *_PyUnicode_data(void *unicode){
763 printf("obj %p\n", unicode);
764 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
765 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
766 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
767 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
768 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
769 return PyUnicode_DATA(unicode);
770}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200771
772void
773_PyUnicode_Dump(PyObject *op)
774{
775 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200776 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
777 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
778 void *data;
779 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
780 if (ascii->state.compact)
781 data = (compact + 1);
782 else
783 data = unicode->data.any;
784 if (ascii->wstr == data)
785 printf("shared ");
786 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200787 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200788 printf(" (%zu), ", compact->wstr_length);
789 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
790 printf("shared ");
791 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200792 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200793 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200794}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
797PyObject *
798PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
799{
800 PyObject *obj;
801 PyCompactUnicodeObject *unicode;
802 void *data;
803 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200804 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 Py_ssize_t char_size;
806 Py_ssize_t struct_size;
807
808 /* Optimization for empty strings */
809 if (size == 0 && unicode_empty != NULL) {
810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200811 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812 }
813
814#ifdef Py_DEBUG
815 ++unicode_new_new_calls;
816#endif
817
Victor Stinner9e9d6892011-10-04 01:02:02 +0200818 is_ascii = 0;
819 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 struct_size = sizeof(PyCompactUnicodeObject);
821 if (maxchar < 128) {
822 kind_state = PyUnicode_1BYTE_KIND;
823 char_size = 1;
824 is_ascii = 1;
825 struct_size = sizeof(PyASCIIObject);
826 }
827 else if (maxchar < 256) {
828 kind_state = PyUnicode_1BYTE_KIND;
829 char_size = 1;
830 }
831 else if (maxchar < 65536) {
832 kind_state = PyUnicode_2BYTE_KIND;
833 char_size = 2;
834 if (sizeof(wchar_t) == 2)
835 is_sharing = 1;
836 }
837 else {
838 kind_state = PyUnicode_4BYTE_KIND;
839 char_size = 4;
840 if (sizeof(wchar_t) == 4)
841 is_sharing = 1;
842 }
843
844 /* Ensure we won't overflow the size. */
845 if (size < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to PyUnicode_New");
848 return NULL;
849 }
850 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
851 return PyErr_NoMemory();
852
853 /* Duplicated allocation code from _PyObject_New() instead of a call to
854 * PyObject_New() so we are able to allocate space for the object and
855 * it's data buffer.
856 */
857 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
858 if (obj == NULL)
859 return PyErr_NoMemory();
860 obj = PyObject_INIT(obj, &PyUnicode_Type);
861 if (obj == NULL)
862 return NULL;
863
864 unicode = (PyCompactUnicodeObject *)obj;
865 if (is_ascii)
866 data = ((PyASCIIObject*)obj) + 1;
867 else
868 data = unicode + 1;
869 _PyUnicode_LENGTH(unicode) = size;
870 _PyUnicode_HASH(unicode) = -1;
871 _PyUnicode_STATE(unicode).interned = 0;
872 _PyUnicode_STATE(unicode).kind = kind_state;
873 _PyUnicode_STATE(unicode).compact = 1;
874 _PyUnicode_STATE(unicode).ready = 1;
875 _PyUnicode_STATE(unicode).ascii = is_ascii;
876 if (is_ascii) {
877 ((char*)data)[size] = 0;
878 _PyUnicode_WSTR(unicode) = NULL;
879 }
880 else if (kind_state == PyUnicode_1BYTE_KIND) {
881 ((char*)data)[size] = 0;
882 _PyUnicode_WSTR(unicode) = NULL;
883 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200885 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 }
887 else {
888 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 if (kind_state == PyUnicode_2BYTE_KIND)
891 ((Py_UCS2*)data)[size] = 0;
892 else /* kind_state == PyUnicode_4BYTE_KIND */
893 ((Py_UCS4*)data)[size] = 0;
894 if (is_sharing) {
895 _PyUnicode_WSTR_LENGTH(unicode) = size;
896 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
897 }
898 else {
899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
900 _PyUnicode_WSTR(unicode) = NULL;
901 }
902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200903 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 return obj;
905}
906
907#if SIZEOF_WCHAR_T == 2
908/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
909 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200910 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911
912 This function assumes that unicode can hold one more code point than wstr
913 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200914static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
916 PyUnicodeObject *unicode)
917{
918 const wchar_t *iter;
919 Py_UCS4 *ucs4_out;
920
Victor Stinner910337b2011-10-03 03:20:16 +0200921 assert(unicode != NULL);
922 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
924 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
925
926 for (iter = begin; iter < end; ) {
927 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
928 _PyUnicode_GET_LENGTH(unicode)));
929 if (*iter >= 0xD800 && *iter <= 0xDBFF
930 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
931 {
932 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
933 iter += 2;
934 }
935 else {
936 *ucs4_out++ = *iter;
937 iter++;
938 }
939 }
940 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
941 _PyUnicode_GET_LENGTH(unicode)));
942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200943}
944#endif
945
Victor Stinnercd9950f2011-10-02 00:34:53 +0200946static int
947_PyUnicode_Dirty(PyObject *unicode)
948{
Victor Stinner910337b2011-10-03 03:20:16 +0200949 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200951 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200952 "Cannot modify a string having more than 1 reference");
953 return -1;
954 }
955 _PyUnicode_DIRTY(unicode);
956 return 0;
957}
958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200959static int
960_copy_characters(PyObject *to, Py_ssize_t to_start,
961 PyObject *from, Py_ssize_t from_start,
962 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 unsigned int from_kind, to_kind;
965 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200968 assert(PyUnicode_Check(from));
969 assert(PyUnicode_Check(to));
970 assert(PyUnicode_IS_READY(from));
971 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200973 assert(PyUnicode_GET_LENGTH(from) >= how_many);
974 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
975 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200977 if (how_many == 0)
978 return 0;
979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200983 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200985#ifdef Py_DEBUG
986 if (!check_maxchar
987 && (from_kind > to_kind
988 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200990 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
991 Py_UCS4 ch;
992 Py_ssize_t i;
993 for (i=0; i < how_many; i++) {
994 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
995 assert(ch <= to_maxchar);
996 }
997 }
998#endif
999 fast = (from_kind == to_kind);
1000 if (check_maxchar
1001 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1002 {
1003 /* deny latin1 => ascii */
1004 fast = 0;
1005 }
1006
1007 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001011 + PyUnicode_KIND_SIZE(from_kind, from_start),
1012 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else if (from_kind == PyUnicode_1BYTE_KIND
1015 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001016 {
1017 _PyUnicode_CONVERT_BYTES(
1018 Py_UCS1, Py_UCS2,
1019 PyUnicode_1BYTE_DATA(from) + from_start,
1020 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1021 PyUnicode_2BYTE_DATA(to) + to_start
1022 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001024 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001025 && to_kind == PyUnicode_4BYTE_KIND)
1026 {
1027 _PyUnicode_CONVERT_BYTES(
1028 Py_UCS1, Py_UCS4,
1029 PyUnicode_1BYTE_DATA(from) + from_start,
1030 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1031 PyUnicode_4BYTE_DATA(to) + to_start
1032 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001033 }
1034 else if (from_kind == PyUnicode_2BYTE_KIND
1035 && to_kind == PyUnicode_4BYTE_KIND)
1036 {
1037 _PyUnicode_CONVERT_BYTES(
1038 Py_UCS2, Py_UCS4,
1039 PyUnicode_2BYTE_DATA(from) + from_start,
1040 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1041 PyUnicode_4BYTE_DATA(to) + to_start
1042 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001043 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 /* check if max_char(from substring) <= max_char(to) */
1046 if (from_kind > to_kind
1047 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001048 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001049 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 /* slow path to check for character overflow */
1051 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 Py_ssize_t i;
1054
Victor Stinner56c161a2011-10-06 02:47:11 +02001055#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001056 for (i=0; i < how_many; i++) {
1057 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001058 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1060 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001061#else
1062 if (!check_maxchar) {
1063 for (i=0; i < how_many; i++) {
1064 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1065 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1066 }
1067 }
1068 else {
1069 for (i=0; i < how_many; i++) {
1070 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1071 if (ch > to_maxchar)
1072 return 1;
1073 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1074 }
1075 }
1076#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001077 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001078 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001079 assert(0 && "inconsistent state");
1080 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001081 }
1082 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001083 return 0;
1084}
1085
1086static void
1087copy_characters(PyObject *to, Py_ssize_t to_start,
1088 PyObject *from, Py_ssize_t from_start,
1089 Py_ssize_t how_many)
1090{
1091 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1092}
1093
1094Py_ssize_t
1095PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1096 PyObject *from, Py_ssize_t from_start,
1097 Py_ssize_t how_many)
1098{
1099 int err;
1100
1101 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1102 PyErr_BadInternalCall();
1103 return -1;
1104 }
1105
1106 if (PyUnicode_READY(from))
1107 return -1;
1108 if (PyUnicode_READY(to))
1109 return -1;
1110
1111 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1112 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1113 PyErr_Format(PyExc_SystemError,
1114 "Cannot write %zi characters at %zi "
1115 "in a string of %zi characters",
1116 how_many, to_start, PyUnicode_GET_LENGTH(to));
1117 return -1;
1118 }
1119
1120 if (how_many == 0)
1121 return 0;
1122
1123 if (_PyUnicode_Dirty(to))
1124 return -1;
1125
1126 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1127 if (err) {
1128 PyErr_Format(PyExc_SystemError,
1129 "Cannot copy %s characters "
1130 "into a string of %s characters",
1131 unicode_kind_name(from),
1132 unicode_kind_name(to));
1133 return -1;
1134 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136}
1137
Victor Stinner17222162011-09-28 22:15:37 +02001138/* Find the maximum code point and count the number of surrogate pairs so a
1139 correct string length can be computed before converting a string to UCS4.
1140 This function counts single surrogates as a character and not as a pair.
1141
1142 Return 0 on success, or -1 on error. */
1143static int
1144find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1145 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
1147 const wchar_t *iter;
1148
Victor Stinnerc53be962011-10-02 21:33:54 +02001149 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 *num_surrogates = 0;
1151 *maxchar = 0;
1152
1153 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001154 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001156#if SIZEOF_WCHAR_T != 2
1157 if (*maxchar >= 0x10000)
1158 return 0;
1159#endif
1160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161#if SIZEOF_WCHAR_T == 2
1162 if (*iter >= 0xD800 && *iter <= 0xDBFF
1163 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1164 {
1165 Py_UCS4 surrogate_val;
1166 surrogate_val = (((iter[0] & 0x3FF)<<10)
1167 | (iter[1] & 0x3FF)) + 0x10000;
1168 ++(*num_surrogates);
1169 if (surrogate_val > *maxchar)
1170 *maxchar = surrogate_val;
1171 iter += 2;
1172 }
1173 else
1174 iter++;
1175#else
1176 iter++;
1177#endif
1178 }
1179 return 0;
1180}
1181
1182#ifdef Py_DEBUG
1183int unicode_ready_calls = 0;
1184#endif
1185
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001186static int
1187unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001189 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 wchar_t *end;
1191 Py_UCS4 maxchar = 0;
1192 Py_ssize_t num_surrogates;
1193#if SIZEOF_WCHAR_T == 2
1194 Py_ssize_t length_wo_surrogates;
1195#endif
1196
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001197 assert(p_obj != NULL);
1198 unicode = (PyUnicodeObject *)*p_obj;
1199
Georg Brandl7597add2011-10-05 16:36:47 +02001200 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001201 strings were created using _PyObject_New() and where no canonical
1202 representation (the str field) has been set yet aka strings
1203 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001204 assert(_PyUnicode_CHECK(unicode));
1205 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001207 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001208 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001209 /* Actually, it should neither be interned nor be anything else: */
1210 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
1212#ifdef Py_DEBUG
1213 ++unicode_ready_calls;
1214#endif
1215
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001216#ifdef Py_DEBUG
1217 assert(!replace || Py_REFCNT(unicode) == 1);
1218#else
1219 if (replace && Py_REFCNT(unicode) != 1)
1220 replace = 0;
1221#endif
1222 if (replace) {
1223 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1224 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1225 /* Optimization for empty strings */
1226 if (len == 0) {
1227 Py_INCREF(unicode_empty);
1228 Py_DECREF(*p_obj);
1229 *p_obj = unicode_empty;
1230 return 0;
1231 }
1232 if (len == 1 && wstr[0] < 256) {
1233 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1234 if (latin1_char == NULL)
1235 return -1;
1236 Py_DECREF(*p_obj);
1237 *p_obj = latin1_char;
1238 return 0;
1239 }
1240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001243 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001244 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246
1247 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001248 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1249 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 PyErr_NoMemory();
1251 return -1;
1252 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001253 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 _PyUnicode_WSTR(unicode), end,
1255 PyUnicode_1BYTE_DATA(unicode));
1256 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1257 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1258 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1259 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001260 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001261 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001262 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 }
1264 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001265 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001266 _PyUnicode_UTF8(unicode) = NULL;
1267 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 }
1269 PyObject_FREE(_PyUnicode_WSTR(unicode));
1270 _PyUnicode_WSTR(unicode) = NULL;
1271 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1272 }
1273 /* In this case we might have to convert down from 4-byte native
1274 wchar_t to 2-byte unicode. */
1275 else if (maxchar < 65536) {
1276 assert(num_surrogates == 0 &&
1277 "FindMaxCharAndNumSurrogatePairs() messed up");
1278
Victor Stinner506f5922011-09-28 22:34:18 +02001279#if SIZEOF_WCHAR_T == 2
1280 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001281 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001282 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1283 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1284 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001285 _PyUnicode_UTF8(unicode) = NULL;
1286 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001287#else
1288 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001289 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001290 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001291 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001292 PyErr_NoMemory();
1293 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 }
Victor Stinner506f5922011-09-28 22:34:18 +02001295 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1296 _PyUnicode_WSTR(unicode), end,
1297 PyUnicode_2BYTE_DATA(unicode));
1298 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1299 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1300 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001301 _PyUnicode_UTF8(unicode) = NULL;
1302 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001303 PyObject_FREE(_PyUnicode_WSTR(unicode));
1304 _PyUnicode_WSTR(unicode) = NULL;
1305 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1306#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 }
1308 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1309 else {
1310#if SIZEOF_WCHAR_T == 2
1311 /* in case the native representation is 2-bytes, we need to allocate a
1312 new normalized 4-byte version. */
1313 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001314 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1315 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyErr_NoMemory();
1317 return -1;
1318 }
1319 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1320 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001321 _PyUnicode_UTF8(unicode) = NULL;
1322 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001323 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1324 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001325 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyObject_FREE(_PyUnicode_WSTR(unicode));
1327 _PyUnicode_WSTR(unicode) = NULL;
1328 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1329#else
1330 assert(num_surrogates == 0);
1331
Victor Stinnerc3c74152011-10-02 20:39:55 +02001332 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001334 _PyUnicode_UTF8(unicode) = NULL;
1335 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1337#endif
1338 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1339 }
1340 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001341 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 return 0;
1343}
1344
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001345int
1346_PyUnicode_ReadyReplace(PyObject **op)
1347{
1348 return unicode_ready(op, 1);
1349}
1350
1351int
1352_PyUnicode_Ready(PyObject *op)
1353{
1354 return unicode_ready(&op, 0);
1355}
1356
Alexander Belopolsky40018472011-02-26 01:02:56 +00001357static void
1358unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
Walter Dörwald16807132007-05-25 13:52:07 +00001360 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 case SSTATE_NOT_INTERNED:
1362 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001363
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 case SSTATE_INTERNED_MORTAL:
1365 /* revive dead object temporarily for DelItem */
1366 Py_REFCNT(unicode) = 3;
1367 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1368 Py_FatalError(
1369 "deletion of interned string failed");
1370 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 case SSTATE_INTERNED_IMMORTAL:
1373 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 default:
1376 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001377 }
1378
Victor Stinner03490912011-10-03 23:45:12 +02001379 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001381 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383
1384 if (PyUnicode_IS_COMPACT(unicode)) {
1385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 if (_PyUnicode_DATA_ANY(unicode))
1389 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 }
1392}
1393
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001394#ifdef Py_DEBUG
1395static int
1396unicode_is_singleton(PyObject *unicode)
1397{
1398 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1399 if (unicode == unicode_empty)
1400 return 1;
1401 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1402 {
1403 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1404 if (ch < 256 && unicode_latin1[ch] == unicode)
1405 return 1;
1406 }
1407 return 0;
1408}
1409#endif
1410
Alexander Belopolsky40018472011-02-26 01:02:56 +00001411static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001412unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001413{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001414 if (Py_REFCNT(unicode) != 1)
1415 return 0;
1416 if (PyUnicode_CHECK_INTERNED(unicode))
1417 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001418#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001419 /* singleton refcount is greater than 1 */
1420 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001421#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001422 return 1;
1423}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001424
Victor Stinnerfe226c02011-10-03 03:52:20 +02001425static int
1426unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1427{
1428 PyObject *unicode;
1429 Py_ssize_t old_length;
1430
1431 assert(p_unicode != NULL);
1432 unicode = *p_unicode;
1433
1434 assert(unicode != NULL);
1435 assert(PyUnicode_Check(unicode));
1436 assert(0 <= length);
1437
Victor Stinner910337b2011-10-03 03:20:16 +02001438 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001439 old_length = PyUnicode_WSTR_LENGTH(unicode);
1440 else
1441 old_length = PyUnicode_GET_LENGTH(unicode);
1442 if (old_length == length)
1443 return 0;
1444
Victor Stinnerfe226c02011-10-03 03:52:20 +02001445 if (!unicode_resizable(unicode)) {
1446 PyObject *copy = resize_copy(unicode, length);
1447 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001449 Py_DECREF(*p_unicode);
1450 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001452 }
1453
Victor Stinnerfe226c02011-10-03 03:52:20 +02001454 if (PyUnicode_IS_COMPACT(unicode)) {
1455 *p_unicode = resize_compact(unicode, length);
1456 if (*p_unicode == NULL)
1457 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001458 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001459 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001460 }
1461 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001465PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001466{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001467 PyObject *unicode;
1468 if (p_unicode == NULL) {
1469 PyErr_BadInternalCall();
1470 return -1;
1471 }
1472 unicode = *p_unicode;
1473 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1474 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1475 {
1476 PyErr_BadInternalCall();
1477 return -1;
1478 }
1479 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001480}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482static PyObject*
1483get_latin1_char(unsigned char ch)
1484{
Victor Stinnera464fc12011-10-02 20:39:30 +02001485 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (!unicode)
1489 return NULL;
1490 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001491 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 unicode_latin1[ch] = unicode;
1493 }
1494 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001495 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496}
1497
Alexander Belopolsky40018472011-02-26 01:02:56 +00001498PyObject *
1499PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500{
1501 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 Py_UCS4 maxchar = 0;
1503 Py_ssize_t num_surrogates;
1504
1505 if (u == NULL)
1506 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001508 /* If the Unicode data is known at construction time, we can apply
1509 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 /* Optimization for empty strings */
1512 if (size == 0 && unicode_empty != NULL) {
1513 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001514 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001515 }
Tim Petersced69f82003-09-16 20:30:58 +00001516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 /* Single character Unicode objects in the Latin-1 range are
1518 shared when using this constructor */
1519 if (size == 1 && *u < 256)
1520 return get_latin1_char((unsigned char)*u);
1521
1522 /* If not empty and not single character, copy the Unicode data
1523 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001524 if (find_maxchar_surrogates(u, u + size,
1525 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 return NULL;
1527
1528 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1529 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (!unicode)
1531 return NULL;
1532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 switch (PyUnicode_KIND(unicode)) {
1534 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001535 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1537 break;
1538 case PyUnicode_2BYTE_KIND:
1539#if Py_UNICODE_SIZE == 2
1540 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1541#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001542 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1544#endif
1545 break;
1546 case PyUnicode_4BYTE_KIND:
1547#if SIZEOF_WCHAR_T == 2
1548 /* This is the only case which has to process surrogates, thus
1549 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001550 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551#else
1552 assert(num_surrogates == 0);
1553 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1554#endif
1555 break;
1556 default:
1557 assert(0 && "Impossible state");
1558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001560 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return (PyObject *)unicode;
1562}
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564PyObject *
1565PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001566{
1567 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001568
Benjamin Peterson14339b62009-01-31 16:36:08 +00001569 if (size < 0) {
1570 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001572 return NULL;
1573 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001574
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001575 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001576 some optimizations which share commonly used objects.
1577 Also, this means the input must be UTF-8, so fall back to the
1578 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001579 if (u != NULL) {
1580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 /* Optimization for empty strings */
1582 if (size == 0 && unicode_empty != NULL) {
1583 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001584 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001586
1587 /* Single characters are shared when using this constructor.
1588 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if (size == 1 && Py_CHARMASK(*u) < 128)
1590 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001591
1592 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001593 }
1594
Walter Dörwald55507312007-05-18 13:12:10 +00001595 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001596 if (!unicode)
1597 return NULL;
1598
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001599 return (PyObject *)unicode;
1600}
1601
Alexander Belopolsky40018472011-02-26 01:02:56 +00001602PyObject *
1603PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001604{
1605 size_t size = strlen(u);
1606 if (size > PY_SSIZE_T_MAX) {
1607 PyErr_SetString(PyExc_OverflowError, "input too long");
1608 return NULL;
1609 }
1610
1611 return PyUnicode_FromStringAndSize(u, size);
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001615unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001616{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001617 PyObject *res;
1618#ifdef Py_DEBUG
1619 const unsigned char *p;
1620 const unsigned char *end = s + size;
1621 for (p=s; p < end; p++) {
1622 assert(*p < 128);
1623 }
1624#endif
1625 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001626 if (!res)
1627 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001628 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001629 return res;
1630}
1631
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001632static Py_UCS4
1633kind_maxchar_limit(unsigned int kind)
1634{
1635 switch(kind) {
1636 case PyUnicode_1BYTE_KIND:
1637 return 0x80;
1638 case PyUnicode_2BYTE_KIND:
1639 return 0x100;
1640 case PyUnicode_4BYTE_KIND:
1641 return 0x10000;
1642 default:
1643 assert(0 && "invalid kind");
1644 return 0x10ffff;
1645 }
1646}
1647
Victor Stinner702c7342011-10-05 13:50:52 +02001648static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001652 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654
1655 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 for (i = 0; i < size; i++) {
1657 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001658 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001660 }
1661 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 if (!res)
1664 return NULL;
1665 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001666 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001668}
1669
Victor Stinnere57b1c02011-09-28 22:20:48 +02001670static PyObject*
1671_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672{
1673 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001674 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001676
1677 assert(size >= 0);
1678 for (i = 0; i < size; i++) {
1679 if (u[i] > max_char) {
1680 max_char = u[i];
1681 if (max_char >= 256)
1682 break;
1683 }
1684 }
1685 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 if (!res)
1687 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001688 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1690 else
1691 for (i = 0; i < size; i++)
1692 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001693 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 return res;
1695}
1696
Victor Stinnere57b1c02011-09-28 22:20:48 +02001697static PyObject*
1698_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699{
1700 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001701 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001703
1704 assert(size >= 0);
1705 for (i = 0; i < size; i++) {
1706 if (u[i] > max_char) {
1707 max_char = u[i];
1708 if (max_char >= 0x10000)
1709 break;
1710 }
1711 }
1712 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!res)
1714 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001715 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1717 else {
1718 int kind = PyUnicode_KIND(res);
1719 void *data = PyUnicode_DATA(res);
1720 for (i = 0; i < size; i++)
1721 PyUnicode_WRITE(kind, data, i, u[i]);
1722 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001723 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 return res;
1725}
1726
1727PyObject*
1728PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1729{
1730 switch(kind) {
1731 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001732 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001734 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001736 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737 default:
1738 assert(0 && "invalid kind");
1739 PyErr_SetString(PyExc_SystemError, "invalid kind");
1740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742}
1743
Victor Stinner25a4b292011-10-06 12:31:55 +02001744/* Ensure that a string uses the most efficient storage, if it is not the
1745 case: create a new string with of the right kind. Write NULL into *p_unicode
1746 on error. */
1747void
1748unicode_adjust_maxchar(PyObject **p_unicode)
1749{
1750 PyObject *unicode, *copy;
1751 Py_UCS4 max_char;
1752 Py_ssize_t i, len;
1753 unsigned int kind;
1754
1755 assert(p_unicode != NULL);
1756 unicode = *p_unicode;
1757 assert(PyUnicode_IS_READY(unicode));
1758 if (PyUnicode_IS_ASCII(unicode))
1759 return;
1760
1761 len = PyUnicode_GET_LENGTH(unicode);
1762 kind = PyUnicode_KIND(unicode);
1763 if (kind == PyUnicode_1BYTE_KIND) {
1764 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1765 for (i = 0; i < len; i++) {
1766 if (u[i] & 0x80)
1767 return;
1768 }
1769 max_char = 127;
1770 }
1771 else if (kind == PyUnicode_2BYTE_KIND) {
1772 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1773 max_char = 0;
1774 for (i = 0; i < len; i++) {
1775 if (u[i] > max_char) {
1776 max_char = u[i];
1777 if (max_char >= 256)
1778 return;
1779 }
1780 }
1781 }
1782 else {
1783 assert(kind == PyUnicode_4BYTE_KIND);
1784 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1785 max_char = 0;
1786 for (i = 0; i < len; i++) {
1787 if (u[i] > max_char) {
1788 max_char = u[i];
1789 if (max_char >= 0x10000)
1790 return;
1791 }
1792 }
1793 }
Victor Stinner200f2132011-10-06 13:27:56 +02001794 assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinner25a4b292011-10-06 12:31:55 +02001795 copy = PyUnicode_New(len, max_char);
1796 copy_characters(copy, 0, unicode, 0, len);
1797 Py_DECREF(unicode);
1798 *p_unicode = copy;
1799}
1800
Victor Stinner034f6cf2011-09-30 02:26:44 +02001801PyObject*
1802PyUnicode_Copy(PyObject *unicode)
1803{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001804 Py_ssize_t size;
1805 PyObject *copy;
1806 void *data;
1807
Victor Stinner034f6cf2011-09-30 02:26:44 +02001808 if (!PyUnicode_Check(unicode)) {
1809 PyErr_BadInternalCall();
1810 return NULL;
1811 }
1812 if (PyUnicode_READY(unicode))
1813 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001814
1815 size = PyUnicode_GET_LENGTH(unicode);
1816 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1817 if (!copy)
1818 return NULL;
1819 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1820
1821 data = PyUnicode_DATA(unicode);
1822 switch (PyUnicode_KIND(unicode))
1823 {
1824 case PyUnicode_1BYTE_KIND:
1825 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1826 break;
1827 case PyUnicode_2BYTE_KIND:
1828 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1829 break;
1830 case PyUnicode_4BYTE_KIND:
1831 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1832 break;
1833 default:
1834 assert(0);
1835 break;
1836 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001837 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001838 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001839}
1840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841
Victor Stinnerbc603d12011-10-02 01:00:40 +02001842/* Widen Unicode objects to larger buffers. Don't write terminating null
1843 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844
1845void*
1846_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1847{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001848 Py_ssize_t len;
1849 void *result;
1850 unsigned int skind;
1851
1852 if (PyUnicode_READY(s))
1853 return NULL;
1854
1855 len = PyUnicode_GET_LENGTH(s);
1856 skind = PyUnicode_KIND(s);
1857 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001858 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 return NULL;
1860 }
1861 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001862 case PyUnicode_2BYTE_KIND:
1863 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1864 if (!result)
1865 return PyErr_NoMemory();
1866 assert(skind == PyUnicode_1BYTE_KIND);
1867 _PyUnicode_CONVERT_BYTES(
1868 Py_UCS1, Py_UCS2,
1869 PyUnicode_1BYTE_DATA(s),
1870 PyUnicode_1BYTE_DATA(s) + len,
1871 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001873 case PyUnicode_4BYTE_KIND:
1874 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1875 if (!result)
1876 return PyErr_NoMemory();
1877 if (skind == PyUnicode_2BYTE_KIND) {
1878 _PyUnicode_CONVERT_BYTES(
1879 Py_UCS2, Py_UCS4,
1880 PyUnicode_2BYTE_DATA(s),
1881 PyUnicode_2BYTE_DATA(s) + len,
1882 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001884 else {
1885 assert(skind == PyUnicode_1BYTE_KIND);
1886 _PyUnicode_CONVERT_BYTES(
1887 Py_UCS1, Py_UCS4,
1888 PyUnicode_1BYTE_DATA(s),
1889 PyUnicode_1BYTE_DATA(s) + len,
1890 result);
1891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001893 default:
1894 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 }
Victor Stinner01698042011-10-04 00:04:26 +02001896 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 return NULL;
1898}
1899
1900static Py_UCS4*
1901as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1902 int copy_null)
1903{
1904 int kind;
1905 void *data;
1906 Py_ssize_t len, targetlen;
1907 if (PyUnicode_READY(string) == -1)
1908 return NULL;
1909 kind = PyUnicode_KIND(string);
1910 data = PyUnicode_DATA(string);
1911 len = PyUnicode_GET_LENGTH(string);
1912 targetlen = len;
1913 if (copy_null)
1914 targetlen++;
1915 if (!target) {
1916 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1917 PyErr_NoMemory();
1918 return NULL;
1919 }
1920 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1921 if (!target) {
1922 PyErr_NoMemory();
1923 return NULL;
1924 }
1925 }
1926 else {
1927 if (targetsize < targetlen) {
1928 PyErr_Format(PyExc_SystemError,
1929 "string is longer than the buffer");
1930 if (copy_null && 0 < targetsize)
1931 target[0] = 0;
1932 return NULL;
1933 }
1934 }
1935 if (kind != PyUnicode_4BYTE_KIND) {
1936 Py_ssize_t i;
1937 for (i = 0; i < len; i++)
1938 target[i] = PyUnicode_READ(kind, data, i);
1939 }
1940 else
1941 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1942 if (copy_null)
1943 target[len] = 0;
1944 return target;
1945}
1946
1947Py_UCS4*
1948PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1949 int copy_null)
1950{
1951 if (target == NULL || targetsize < 1) {
1952 PyErr_BadInternalCall();
1953 return NULL;
1954 }
1955 return as_ucs4(string, target, targetsize, copy_null);
1956}
1957
1958Py_UCS4*
1959PyUnicode_AsUCS4Copy(PyObject *string)
1960{
1961 return as_ucs4(string, NULL, 0, 1);
1962}
1963
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means the length is measured with wcslen().  With
   w == NULL, a size of 0 yields the result of PyUnicode_New(0, 0) and any
   other size raises a bad-internal-call error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984
Walter Dörwald346737f2007-05-31 10:44:43 +00001985static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001986makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1987 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001988{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001989 *fmt++ = '%';
1990 if (width) {
1991 if (zeropad)
1992 *fmt++ = '0';
1993 fmt += sprintf(fmt, "%d", width);
1994 }
1995 if (precision)
1996 fmt += sprintf(fmt, ".%d", precision);
1997 if (longflag)
1998 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001999 else if (longlongflag) {
2000 /* longlongflag should only ever be nonzero on machines with
2001 HAVE_LONG_LONG defined */
2002#ifdef HAVE_LONG_LONG
2003 char *f = PY_FORMAT_LONG_LONG;
2004 while (*f)
2005 *fmt++ = *f++;
2006#else
2007 /* we shouldn't ever get here */
2008 assert(0);
2009 *fmt++ = 'l';
2010#endif
2011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002012 else if (size_tflag) {
2013 char *f = PY_FORMAT_SIZE_T;
2014 while (*f)
2015 *fmt++ = *f++;
2016 }
2017 *fmt++ = c;
2018 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002019}
2020
Victor Stinner96865452011-03-01 23:44:09 +00002021/* helper for PyUnicode_FromFormatV() */
2022
2023static const char*
2024parse_format_flags(const char *f,
2025 int *p_width, int *p_precision,
2026 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2027{
2028 int width, precision, longflag, longlongflag, size_tflag;
2029
2030 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2031 f++;
2032 width = 0;
2033 while (Py_ISDIGIT((unsigned)*f))
2034 width = (width*10) + *f++ - '0';
2035 precision = 0;
2036 if (*f == '.') {
2037 f++;
2038 while (Py_ISDIGIT((unsigned)*f))
2039 precision = (precision*10) + *f++ - '0';
2040 if (*f == '%') {
2041 /* "%.3%s" => f points to "3" */
2042 f--;
2043 }
2044 }
2045 if (*f == '\0') {
2046 /* bogus format "%.1" => go backward, f points to "1" */
2047 f--;
2048 }
2049 if (p_width != NULL)
2050 *p_width = width;
2051 if (p_precision != NULL)
2052 *p_precision = precision;
2053
2054 /* Handle %ld, %lu, %lld and %llu. */
2055 longflag = 0;
2056 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002057 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002058
2059 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002060 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002061 longflag = 1;
2062 ++f;
2063 }
2064#ifdef HAVE_LONG_LONG
2065 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002066 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002067 longlongflag = 1;
2068 f += 2;
2069 }
2070#endif
2071 }
2072 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002073 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002074 size_tflag = 1;
2075 ++f;
2076 }
2077 if (p_longflag != NULL)
2078 *p_longflag = longflag;
2079 if (p_longlongflag != NULL)
2080 *p_longlongflag = longlongflag;
2081 if (p_size_tflag != NULL)
2082 *p_size_tflag = size_tflag;
2083 return f;
2084}
2085
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002086/* maximum number of characters required for output of %ld. 21 characters
2087 allows for 64-bit integers (in decimal) and an optional sign. */
2088#define MAX_LONG_CHARS 21
2089/* maximum number of characters required for output of %lld.
2090 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2091 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2092#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2093
Walter Dörwaldd2034312007-05-18 16:29:38 +00002094PyObject *
2095PyUnicode_FromFormatV(const char *format, va_list vargs)
2096{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002097 va_list count;
2098 Py_ssize_t callcount = 0;
2099 PyObject **callresults = NULL;
2100 PyObject **callresult = NULL;
2101 Py_ssize_t n = 0;
2102 int width = 0;
2103 int precision = 0;
2104 int zeropad;
2105 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002106 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002107 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002108 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2110 Py_UCS4 argmaxchar;
2111 Py_ssize_t numbersize = 0;
2112 char *numberresults = NULL;
2113 char *numberresult = NULL;
2114 Py_ssize_t i;
2115 int kind;
2116 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002117
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002118 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002119 /* step 1: count the number of %S/%R/%A/%s format specifications
2120 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2121 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002123 * also estimate a upper bound for all the number formats in the string,
2124 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002126 for (f = format; *f; f++) {
2127 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002128 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2130 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2131 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2132 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002135#ifdef HAVE_LONG_LONG
2136 if (longlongflag) {
2137 if (width < MAX_LONG_LONG_CHARS)
2138 width = MAX_LONG_LONG_CHARS;
2139 }
2140 else
2141#endif
2142 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2143 including sign. Decimal takes the most space. This
2144 isn't enough for octal. If a width is specified we
2145 need more (which we allocate later). */
2146 if (width < MAX_LONG_CHARS)
2147 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148
2149 /* account for the size + '\0' to separate numbers
2150 inside of the numberresults buffer */
2151 numbersize += (width + 1);
2152 }
2153 }
2154 else if ((unsigned char)*f > 127) {
2155 PyErr_Format(PyExc_ValueError,
2156 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2157 "string, got a non-ASCII byte: 0x%02x",
2158 (unsigned char)*f);
2159 return NULL;
2160 }
2161 }
2162 /* step 2: allocate memory for the results of
2163 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2164 if (callcount) {
2165 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2166 if (!callresults) {
2167 PyErr_NoMemory();
2168 return NULL;
2169 }
2170 callresult = callresults;
2171 }
2172 /* step 2.5: allocate memory for the results of formating numbers */
2173 if (numbersize) {
2174 numberresults = PyObject_Malloc(numbersize);
2175 if (!numberresults) {
2176 PyErr_NoMemory();
2177 goto fail;
2178 }
2179 numberresult = numberresults;
2180 }
2181
2182 /* step 3: format numbers and figure out how large a buffer we need */
2183 for (f = format; *f; f++) {
2184 if (*f == '%') {
2185 const char* p;
2186 int longflag;
2187 int longlongflag;
2188 int size_tflag;
2189 int numprinted;
2190
2191 p = f;
2192 zeropad = (f[1] == '0');
2193 f = parse_format_flags(f, &width, &precision,
2194 &longflag, &longlongflag, &size_tflag);
2195 switch (*f) {
2196 case 'c':
2197 {
2198 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002199 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 n++;
2201 break;
2202 }
2203 case '%':
2204 n++;
2205 break;
2206 case 'i':
2207 case 'd':
2208 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2209 width, precision, *f);
2210 if (longflag)
2211 numprinted = sprintf(numberresult, fmt,
2212 va_arg(count, long));
2213#ifdef HAVE_LONG_LONG
2214 else if (longlongflag)
2215 numprinted = sprintf(numberresult, fmt,
2216 va_arg(count, PY_LONG_LONG));
2217#endif
2218 else if (size_tflag)
2219 numprinted = sprintf(numberresult, fmt,
2220 va_arg(count, Py_ssize_t));
2221 else
2222 numprinted = sprintf(numberresult, fmt,
2223 va_arg(count, int));
2224 n += numprinted;
2225 /* advance by +1 to skip over the '\0' */
2226 numberresult += (numprinted + 1);
2227 assert(*(numberresult - 1) == '\0');
2228 assert(*(numberresult - 2) != '\0');
2229 assert(numprinted >= 0);
2230 assert(numberresult <= numberresults + numbersize);
2231 break;
2232 case 'u':
2233 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2234 width, precision, 'u');
2235 if (longflag)
2236 numprinted = sprintf(numberresult, fmt,
2237 va_arg(count, unsigned long));
2238#ifdef HAVE_LONG_LONG
2239 else if (longlongflag)
2240 numprinted = sprintf(numberresult, fmt,
2241 va_arg(count, unsigned PY_LONG_LONG));
2242#endif
2243 else if (size_tflag)
2244 numprinted = sprintf(numberresult, fmt,
2245 va_arg(count, size_t));
2246 else
2247 numprinted = sprintf(numberresult, fmt,
2248 va_arg(count, unsigned int));
2249 n += numprinted;
2250 numberresult += (numprinted + 1);
2251 assert(*(numberresult - 1) == '\0');
2252 assert(*(numberresult - 2) != '\0');
2253 assert(numprinted >= 0);
2254 assert(numberresult <= numberresults + numbersize);
2255 break;
2256 case 'x':
2257 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2258 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2259 n += numprinted;
2260 numberresult += (numprinted + 1);
2261 assert(*(numberresult - 1) == '\0');
2262 assert(*(numberresult - 2) != '\0');
2263 assert(numprinted >= 0);
2264 assert(numberresult <= numberresults + numbersize);
2265 break;
2266 case 'p':
2267 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2268 /* %p is ill-defined: ensure leading 0x. */
2269 if (numberresult[1] == 'X')
2270 numberresult[1] = 'x';
2271 else if (numberresult[1] != 'x') {
2272 memmove(numberresult + 2, numberresult,
2273 strlen(numberresult) + 1);
2274 numberresult[0] = '0';
2275 numberresult[1] = 'x';
2276 numprinted += 2;
2277 }
2278 n += numprinted;
2279 numberresult += (numprinted + 1);
2280 assert(*(numberresult - 1) == '\0');
2281 assert(*(numberresult - 2) != '\0');
2282 assert(numprinted >= 0);
2283 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 break;
2285 case 's':
2286 {
2287 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002288 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002289 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2290 if (!str)
2291 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 /* since PyUnicode_DecodeUTF8 returns already flexible
2293 unicode objects, there is no need to call ready on them */
2294 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002295 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002297 /* Remember the str and switch to the next slot */
2298 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 break;
2300 }
2301 case 'U':
2302 {
2303 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002304 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002305 if (PyUnicode_READY(obj) == -1)
2306 goto fail;
2307 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002308 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 break;
2311 }
2312 case 'V':
2313 {
2314 PyObject *obj = va_arg(count, PyObject *);
2315 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002316 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002317 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002318 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002319 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 if (PyUnicode_READY(obj) == -1)
2321 goto fail;
2322 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002323 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002325 *callresult++ = NULL;
2326 }
2327 else {
2328 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2329 if (!str_obj)
2330 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002331 if (PyUnicode_READY(str_obj)) {
2332 Py_DECREF(str_obj);
2333 goto fail;
2334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002336 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002338 *callresult++ = str_obj;
2339 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002340 break;
2341 }
2342 case 'S':
2343 {
2344 PyObject *obj = va_arg(count, PyObject *);
2345 PyObject *str;
2346 assert(obj);
2347 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002349 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002351 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 /* Remember the str and switch to the next slot */
2354 *callresult++ = str;
2355 break;
2356 }
2357 case 'R':
2358 {
2359 PyObject *obj = va_arg(count, PyObject *);
2360 PyObject *repr;
2361 assert(obj);
2362 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002364 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002366 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002368 /* Remember the repr and switch to the next slot */
2369 *callresult++ = repr;
2370 break;
2371 }
2372 case 'A':
2373 {
2374 PyObject *obj = va_arg(count, PyObject *);
2375 PyObject *ascii;
2376 assert(obj);
2377 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002381 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002383 /* Remember the repr and switch to the next slot */
2384 *callresult++ = ascii;
2385 break;
2386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 default:
2388 /* if we stumble upon an unknown
2389 formatting code, copy the rest of
2390 the format string to the output
2391 string. (we cannot just skip the
2392 code, since there's no way to know
2393 what's in the argument list) */
2394 n += strlen(p);
2395 goto expand;
2396 }
2397 } else
2398 n++;
2399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 we don't have to resize the string.
2404 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002405 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002406 if (!string)
2407 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 kind = PyUnicode_KIND(string);
2409 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002415 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002416
2417 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2419 /* checking for == because the last argument could be a empty
2420 string, which causes i to point to end, the assert at the end of
2421 the loop */
2422 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002423
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 switch (*f) {
2425 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002426 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 const int ordinal = va_arg(vargs, int);
2428 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002429 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002430 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002431 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 case 'p':
2436 /* unused, since we already have the result */
2437 if (*f == 'p')
2438 (void) va_arg(vargs, void *);
2439 else
2440 (void) va_arg(vargs, int);
2441 /* extract the result from numberresults and append. */
2442 for (; *numberresult; ++i, ++numberresult)
2443 PyUnicode_WRITE(kind, data, i, *numberresult);
2444 /* skip over the separating '\0' */
2445 assert(*numberresult == '\0');
2446 numberresult++;
2447 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002448 break;
2449 case 's':
2450 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002451 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002453 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 size = PyUnicode_GET_LENGTH(*callresult);
2455 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002456 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002458 /* We're done with the unicode()/repr() => forget it */
2459 Py_DECREF(*callresult);
2460 /* switch to next unicode()/repr() result */
2461 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 break;
2463 }
2464 case 'U':
2465 {
2466 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 Py_ssize_t size;
2468 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2469 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002470 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 break;
2473 }
2474 case 'V':
2475 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002479 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 size = PyUnicode_GET_LENGTH(obj);
2481 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002482 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002484 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 size = PyUnicode_GET_LENGTH(*callresult);
2486 assert(PyUnicode_KIND(*callresult) <=
2487 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002488 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002490 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002492 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 break;
2494 }
2495 case 'S':
2496 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002497 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002498 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002499 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 /* unused, since we already have the result */
2501 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002503 copy_characters(string, i, *callresult, 0, size);
2504 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 /* We're done with the unicode()/repr() => forget it */
2506 Py_DECREF(*callresult);
2507 /* switch to next unicode()/repr() result */
2508 ++callresult;
2509 break;
2510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002513 break;
2514 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 for (; *p; ++p, ++i)
2516 PyUnicode_WRITE(kind, data, i, *p);
2517 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 goto end;
2519 }
Victor Stinner1205f272010-09-11 00:54:47 +00002520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 else {
2522 assert(i < PyUnicode_GET_LENGTH(string));
2523 PyUnicode_WRITE(kind, data, i++, *f);
2524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002527
Benjamin Peterson29060642009-01-31 22:14:21 +00002528 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002529 if (callresults)
2530 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 if (numberresults)
2532 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002533 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002536 if (callresults) {
2537 PyObject **callresult2 = callresults;
2538 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002539 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 ++callresult2;
2541 }
2542 PyObject_Free(callresults);
2543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544 if (numberresults)
2545 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002547}
2548
Walter Dörwaldd2034312007-05-18 16:29:38 +00002549PyObject *
2550PyUnicode_FromFormat(const char *format, ...)
2551{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 PyObject* ret;
2553 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002554
2555#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002557#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002559#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 ret = PyUnicode_FromFormatV(format, vargs);
2561 va_end(vargs);
2562 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002563}
2564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565#ifdef HAVE_WCHAR_H
2566
Victor Stinner5593d8a2010-10-02 11:11:27 +00002567/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2568 convert a Unicode object to a wide character string.
2569
Victor Stinnerd88d9832011-09-06 02:00:05 +02002570 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002571 character) required to convert the unicode object. Ignore size argument.
2572
Victor Stinnerd88d9832011-09-06 02:00:05 +02002573 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002574 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002575 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002576static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002577unicode_aswidechar(PyUnicodeObject *unicode,
2578 wchar_t *w,
2579 Py_ssize_t size)
2580{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002581 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const wchar_t *wstr;
2583
2584 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2585 if (wstr == NULL)
2586 return -1;
2587
Victor Stinner5593d8a2010-10-02 11:11:27 +00002588 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002589 if (size > res)
2590 size = res + 1;
2591 else
2592 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002594 return res;
2595 }
2596 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002598}
2599
2600Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002601PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002602 wchar_t *w,
2603 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604{
2605 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 PyErr_BadInternalCall();
2607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002609 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610}
2611
Victor Stinner137c34c2010-09-29 10:25:54 +00002612wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002613PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002614 Py_ssize_t *size)
2615{
2616 wchar_t* buffer;
2617 Py_ssize_t buflen;
2618
2619 if (unicode == NULL) {
2620 PyErr_BadInternalCall();
2621 return NULL;
2622 }
2623
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002624 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 if (buflen == -1)
2626 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002627 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002628 PyErr_NoMemory();
2629 return NULL;
2630 }
2631
Victor Stinner137c34c2010-09-29 10:25:54 +00002632 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2633 if (buffer == NULL) {
2634 PyErr_NoMemory();
2635 return NULL;
2636 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002637 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 if (buflen == -1)
2639 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002640 if (size != NULL)
2641 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002642 return buffer;
2643}
2644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646
Alexander Belopolsky40018472011-02-26 01:02:56 +00002647PyObject *
2648PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002651 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 PyErr_SetString(PyExc_ValueError,
2653 "chr() arg not in range(0x110000)");
2654 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002655 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (ordinal < 256)
2658 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 v = PyUnicode_New(1, ordinal);
2661 if (v == NULL)
2662 return NULL;
2663 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002664 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002666}
2667
Alexander Belopolsky40018472011-02-26 01:02:56 +00002668PyObject *
2669PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002671 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002673 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002674 if (PyUnicode_READY(obj))
2675 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 Py_INCREF(obj);
2677 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002678 }
2679 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 /* For a Unicode subtype that's not a Unicode object,
2681 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002682 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002683 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002684 PyErr_Format(PyExc_TypeError,
2685 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002686 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002687 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002688}
2689
Alexander Belopolsky40018472011-02-26 01:02:56 +00002690PyObject *
2691PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002692 const char *encoding,
2693 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002694{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002695 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002696 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 PyErr_BadInternalCall();
2700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002702
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002703 /* Decoding bytes objects is the most common case and should be fast */
2704 if (PyBytes_Check(obj)) {
2705 if (PyBytes_GET_SIZE(obj) == 0) {
2706 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002707 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002708 }
2709 else {
2710 v = PyUnicode_Decode(
2711 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2712 encoding, errors);
2713 }
2714 return v;
2715 }
2716
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002717 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 PyErr_SetString(PyExc_TypeError,
2719 "decoding str is not supported");
2720 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002722
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002723 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2724 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2725 PyErr_Format(PyExc_TypeError,
2726 "coercing to str: need bytes, bytearray "
2727 "or buffer-like object, %.80s found",
2728 Py_TYPE(obj)->tp_name);
2729 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002730 }
Tim Petersced69f82003-09-16 20:30:58 +00002731
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002732 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002734 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
Tim Petersced69f82003-09-16 20:30:58 +00002736 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002737 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002738
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002739 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002740 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741}
2742
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are lowered and every '_' becomes '-', so that e.g. UTF_8 is recognized.

   `lower_len` is the size of the `lower` buffer, terminating null byte
   included.  Return 1 on success (`lower` is null-terminated).  Return 0 if
   `encoding` is longer than lower_len-1 characters; in that case `lower`
   is left filled but not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Index of the last slot, reserved for the terminating null byte. */
    const size_t last = lower_len - 1;
    size_t i;

    for (i = 0; encoding[i] != '\0'; i++) {
        const char ch = encoding[i];

        if (i == last)
            return 0;

        if (Py_ISUPPER(ch))
            lower[i] = Py_TOLOWER(ch);
        else if (ch == '_')
            lower[i] = '-';
        else
            lower[i] = ch;
    }
    lower[i] = '\0';
    return 1;
}
2775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776PyObject *
2777PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002778 Py_ssize_t size,
2779 const char *encoding,
2780 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002781{
2782 PyObject *buffer = NULL, *unicode;
2783 Py_buffer info;
2784 char lower[11]; /* Enough for any encoding shortcut */
2785
2786 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002787 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002788
2789 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002790 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002791 if ((strcmp(lower, "utf-8") == 0) ||
2792 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002793 return PyUnicode_DecodeUTF8(s, size, errors);
2794 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002795 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002796 (strcmp(lower, "iso-8859-1") == 0))
2797 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002798#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002799 else if (strcmp(lower, "mbcs") == 0)
2800 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002801#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002802 else if (strcmp(lower, "ascii") == 0)
2803 return PyUnicode_DecodeASCII(s, size, errors);
2804 else if (strcmp(lower, "utf-16") == 0)
2805 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2806 else if (strcmp(lower, "utf-32") == 0)
2807 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809
2810 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002811 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002812 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002813 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002814 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 if (buffer == NULL)
2816 goto onError;
2817 unicode = PyCodec_Decode(buffer, encoding, errors);
2818 if (unicode == NULL)
2819 goto onError;
2820 if (!PyUnicode_Check(unicode)) {
2821 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002822 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002823 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 Py_DECREF(unicode);
2825 goto onError;
2826 }
2827 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002828#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002829 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 Py_DECREF(unicode);
2831 return NULL;
2832 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002833#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002834 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002836
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 Py_XDECREF(buffer);
2839 return NULL;
2840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002844 const char *encoding,
2845 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002846{
2847 PyObject *v;
2848
2849 if (!PyUnicode_Check(unicode)) {
2850 PyErr_BadArgument();
2851 goto onError;
2852 }
2853
2854 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002856
2857 /* Decode via the codec registry */
2858 v = PyCodec_Decode(unicode, encoding, errors);
2859 if (v == NULL)
2860 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002861 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002862 return v;
2863
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002865 return NULL;
2866}
2867
Alexander Belopolsky40018472011-02-26 01:02:56 +00002868PyObject *
2869PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002870 const char *encoding,
2871 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002872{
2873 PyObject *v;
2874
2875 if (!PyUnicode_Check(unicode)) {
2876 PyErr_BadArgument();
2877 goto onError;
2878 }
2879
2880 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002882
2883 /* Decode via the codec registry */
2884 v = PyCodec_Decode(unicode, encoding, errors);
2885 if (v == NULL)
2886 goto onError;
2887 if (!PyUnicode_Check(v)) {
2888 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002889 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002890 Py_TYPE(v)->tp_name);
2891 Py_DECREF(v);
2892 goto onError;
2893 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002894 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002895 return v;
2896
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002898 return NULL;
2899}
2900
Alexander Belopolsky40018472011-02-26 01:02:56 +00002901PyObject *
2902PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002903 Py_ssize_t size,
2904 const char *encoding,
2905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906{
2907 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002908
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 unicode = PyUnicode_FromUnicode(s, size);
2910 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2913 Py_DECREF(unicode);
2914 return v;
2915}
2916
Alexander Belopolsky40018472011-02-26 01:02:56 +00002917PyObject *
2918PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002919 const char *encoding,
2920 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002921{
2922 PyObject *v;
2923
2924 if (!PyUnicode_Check(unicode)) {
2925 PyErr_BadArgument();
2926 goto onError;
2927 }
2928
2929 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002931
2932 /* Encode via the codec registry */
2933 v = PyCodec_Encode(unicode, encoding, errors);
2934 if (v == NULL)
2935 goto onError;
2936 return v;
2937
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002939 return NULL;
2940}
2941
Victor Stinnerad158722010-10-27 00:25:46 +00002942PyObject *
2943PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002944{
Victor Stinner99b95382011-07-04 14:23:54 +02002945#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002946 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2947 PyUnicode_GET_SIZE(unicode),
2948 NULL);
2949#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002951#else
Victor Stinner793b5312011-04-27 00:24:21 +02002952 PyInterpreterState *interp = PyThreadState_GET()->interp;
2953 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2954 cannot use it to encode and decode filenames before it is loaded. Load
2955 the Python codec requires to encode at least its own filename. Use the C
2956 version of the locale codec until the codec registry is initialized and
2957 the Python codec is loaded.
2958
2959 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2960 cannot only rely on it: check also interp->fscodec_initialized for
2961 subinterpreters. */
2962 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002963 return PyUnicode_AsEncodedString(unicode,
2964 Py_FileSystemDefaultEncoding,
2965 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002966 }
2967 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002968 /* locale encoding with surrogateescape */
2969 wchar_t *wchar;
2970 char *bytes;
2971 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002972 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002973
2974 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2975 if (wchar == NULL)
2976 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002977 bytes = _Py_wchar2char(wchar, &error_pos);
2978 if (bytes == NULL) {
2979 if (error_pos != (size_t)-1) {
2980 char *errmsg = strerror(errno);
2981 PyObject *exc = NULL;
2982 if (errmsg == NULL)
2983 errmsg = "Py_wchar2char() failed";
2984 raise_encode_exception(&exc,
2985 "filesystemencoding",
2986 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2987 error_pos, error_pos+1,
2988 errmsg);
2989 Py_XDECREF(exc);
2990 }
2991 else
2992 PyErr_NoMemory();
2993 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002994 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002995 }
2996 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002997
2998 bytes_obj = PyBytes_FromString(bytes);
2999 PyMem_Free(bytes);
3000 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003001 }
Victor Stinnerad158722010-10-27 00:25:46 +00003002#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003003}
3004
Alexander Belopolsky40018472011-02-26 01:02:56 +00003005PyObject *
3006PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003007 const char *encoding,
3008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009{
3010 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003011 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003012
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 if (!PyUnicode_Check(unicode)) {
3014 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 }
Fred Drakee4315f52000-05-09 19:53:39 +00003017
Victor Stinner2f283c22011-03-02 01:21:46 +00003018 if (encoding == NULL) {
3019 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003021 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003022 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003023 }
Fred Drakee4315f52000-05-09 19:53:39 +00003024
3025 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003026 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003027 if ((strcmp(lower, "utf-8") == 0) ||
3028 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003029 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003030 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003034 }
Victor Stinner37296e82010-06-10 13:36:23 +00003035 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003036 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003037 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003038 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003039#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003040 else if (strcmp(lower, "mbcs") == 0)
3041 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3042 PyUnicode_GET_SIZE(unicode),
3043 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003044#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003045 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003046 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048
3049 /* Encode via the codec registry */
3050 v = PyCodec_Encode(unicode, encoding, errors);
3051 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003052 return NULL;
3053
3054 /* The normal path */
3055 if (PyBytes_Check(v))
3056 return v;
3057
3058 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003060 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003061 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003062
3063 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3064 "encoder %s returned bytearray instead of bytes",
3065 encoding);
3066 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003067 Py_DECREF(v);
3068 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003071 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3072 Py_DECREF(v);
3073 return b;
3074 }
3075
3076 PyErr_Format(PyExc_TypeError,
3077 "encoder did not return a bytes object (type=%.400s)",
3078 Py_TYPE(v)->tp_name);
3079 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003080 return NULL;
3081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 const char *encoding,
3086 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003087{
3088 PyObject *v;
3089
3090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 goto onError;
3093 }
3094
3095 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003097
3098 /* Encode via the codec registry */
3099 v = PyCodec_Encode(unicode, encoding, errors);
3100 if (v == NULL)
3101 goto onError;
3102 if (!PyUnicode_Check(v)) {
3103 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003104 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003105 Py_TYPE(v)->tp_name);
3106 Py_DECREF(v);
3107 goto onError;
3108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 return NULL;
3113}
3114
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003115PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003116PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003117 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003118 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3119}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003120
Christian Heimes5894ba72007-11-04 11:43:14 +00003121PyObject*
3122PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3123{
Victor Stinner99b95382011-07-04 14:23:54 +02003124#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003125 return PyUnicode_DecodeMBCS(s, size, NULL);
3126#elif defined(__APPLE__)
3127 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3128#else
Victor Stinner793b5312011-04-27 00:24:21 +02003129 PyInterpreterState *interp = PyThreadState_GET()->interp;
3130 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3131 cannot use it to encode and decode filenames before it is loaded. Load
3132 the Python codec requires to encode at least its own filename. Use the C
3133 version of the locale codec until the codec registry is initialized and
3134 the Python codec is loaded.
3135
3136 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3137 cannot only rely on it: check also interp->fscodec_initialized for
3138 subinterpreters. */
3139 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003140 return PyUnicode_Decode(s, size,
3141 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003142 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003143 }
3144 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003145 /* locale encoding with surrogateescape */
3146 wchar_t *wchar;
3147 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003148 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003149
3150 if (s[size] != '\0' || size != strlen(s)) {
3151 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3152 return NULL;
3153 }
3154
Victor Stinner168e1172010-10-16 23:16:16 +00003155 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003156 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003157 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003158
Victor Stinner168e1172010-10-16 23:16:16 +00003159 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003160 PyMem_Free(wchar);
3161 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003162 }
Victor Stinnerad158722010-10-27 00:25:46 +00003163#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003164}
3165
Martin v. Löwis011e8422009-05-05 04:43:17 +00003166
3167int
3168PyUnicode_FSConverter(PyObject* arg, void* addr)
3169{
3170 PyObject *output = NULL;
3171 Py_ssize_t size;
3172 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003173 if (arg == NULL) {
3174 Py_DECREF(*(PyObject**)addr);
3175 return 1;
3176 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003177 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003178 output = arg;
3179 Py_INCREF(output);
3180 }
3181 else {
3182 arg = PyUnicode_FromObject(arg);
3183 if (!arg)
3184 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003185 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003186 Py_DECREF(arg);
3187 if (!output)
3188 return 0;
3189 if (!PyBytes_Check(output)) {
3190 Py_DECREF(output);
3191 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3192 return 0;
3193 }
3194 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003195 size = PyBytes_GET_SIZE(output);
3196 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003197 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003198 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003199 Py_DECREF(output);
3200 return 0;
3201 }
3202 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003203 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003204}
3205
3206
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003207int
3208PyUnicode_FSDecoder(PyObject* arg, void* addr)
3209{
3210 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003211 if (arg == NULL) {
3212 Py_DECREF(*(PyObject**)addr);
3213 return 1;
3214 }
3215 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003216 if (PyUnicode_READY(arg))
3217 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003218 output = arg;
3219 Py_INCREF(output);
3220 }
3221 else {
3222 arg = PyBytes_FromObject(arg);
3223 if (!arg)
3224 return 0;
3225 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3226 PyBytes_GET_SIZE(arg));
3227 Py_DECREF(arg);
3228 if (!output)
3229 return 0;
3230 if (!PyUnicode_Check(output)) {
3231 Py_DECREF(output);
3232 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3233 return 0;
3234 }
3235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003236 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3237 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003238 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3239 Py_DECREF(output);
3240 return 0;
3241 }
3242 *(PyObject**)addr = output;
3243 return Py_CLEANUP_SUPPORTED;
3244}
3245
3246
Martin v. Löwis5b222132007-06-10 09:51:05 +00003247char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003248PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003249{
Christian Heimesf3863112007-11-22 07:46:41 +00003250 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003251 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3252
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003253 if (!PyUnicode_Check(unicode)) {
3254 PyErr_BadArgument();
3255 return NULL;
3256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003258 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003260 if (PyUnicode_UTF8(unicode) == NULL) {
3261 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003262 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3263 if (bytes == NULL)
3264 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003265 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3266 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267 Py_DECREF(bytes);
3268 return NULL;
3269 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003270 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3271 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003272 Py_DECREF(bytes);
3273 }
3274
3275 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003276 *psize = PyUnicode_UTF8_LENGTH(unicode);
3277 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003278}
3279
3280char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003281PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3284}
3285
3286#ifdef Py_DEBUG
3287int unicode_as_unicode_calls = 0;
3288#endif
3289
3290
3291Py_UNICODE *
3292PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3293{
3294 PyUnicodeObject *u;
3295 const unsigned char *one_byte;
3296#if SIZEOF_WCHAR_T == 4
3297 const Py_UCS2 *two_bytes;
3298#else
3299 const Py_UCS4 *four_bytes;
3300 const Py_UCS4 *ucs4_end;
3301 Py_ssize_t num_surrogates;
3302#endif
3303 wchar_t *w;
3304 wchar_t *wchar_end;
3305
3306 if (!PyUnicode_Check(unicode)) {
3307 PyErr_BadArgument();
3308 return NULL;
3309 }
3310 u = (PyUnicodeObject*)unicode;
3311 if (_PyUnicode_WSTR(u) == NULL) {
3312 /* Non-ASCII compact unicode object */
3313 assert(_PyUnicode_KIND(u) != 0);
3314 assert(PyUnicode_IS_READY(u));
3315
3316#ifdef Py_DEBUG
3317 ++unicode_as_unicode_calls;
3318#endif
3319
3320 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3321#if SIZEOF_WCHAR_T == 2
3322 four_bytes = PyUnicode_4BYTE_DATA(u);
3323 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3324 num_surrogates = 0;
3325
3326 for (; four_bytes < ucs4_end; ++four_bytes) {
3327 if (*four_bytes > 0xFFFF)
3328 ++num_surrogates;
3329 }
3330
3331 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3332 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3333 if (!_PyUnicode_WSTR(u)) {
3334 PyErr_NoMemory();
3335 return NULL;
3336 }
3337 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3338
3339 w = _PyUnicode_WSTR(u);
3340 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3341 four_bytes = PyUnicode_4BYTE_DATA(u);
3342 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3343 if (*four_bytes > 0xFFFF) {
3344 /* encode surrogate pair in this case */
3345 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3346 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3347 }
3348 else
3349 *w = *four_bytes;
3350
3351 if (w > wchar_end) {
3352 assert(0 && "Miscalculated string end");
3353 }
3354 }
3355 *w = 0;
3356#else
3357 /* sizeof(wchar_t) == 4 */
3358 Py_FatalError("Impossible unicode object state, wstr and str "
3359 "should share memory already.");
3360 return NULL;
3361#endif
3362 }
3363 else {
3364 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3365 (_PyUnicode_LENGTH(u) + 1));
3366 if (!_PyUnicode_WSTR(u)) {
3367 PyErr_NoMemory();
3368 return NULL;
3369 }
3370 if (!PyUnicode_IS_COMPACT_ASCII(u))
3371 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3372 w = _PyUnicode_WSTR(u);
3373 wchar_end = w + _PyUnicode_LENGTH(u);
3374
3375 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3376 one_byte = PyUnicode_1BYTE_DATA(u);
3377 for (; w < wchar_end; ++one_byte, ++w)
3378 *w = *one_byte;
3379 /* null-terminate the wstr */
3380 *w = 0;
3381 }
3382 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3383#if SIZEOF_WCHAR_T == 4
3384 two_bytes = PyUnicode_2BYTE_DATA(u);
3385 for (; w < wchar_end; ++two_bytes, ++w)
3386 *w = *two_bytes;
3387 /* null-terminate the wstr */
3388 *w = 0;
3389#else
3390 /* sizeof(wchar_t) == 2 */
3391 PyObject_FREE(_PyUnicode_WSTR(u));
3392 _PyUnicode_WSTR(u) = NULL;
3393 Py_FatalError("Impossible unicode object state, wstr "
3394 "and str should share memory already.");
3395 return NULL;
3396#endif
3397 }
3398 else {
3399 assert(0 && "This should never happen.");
3400 }
3401 }
3402 }
3403 if (size != NULL)
3404 *size = PyUnicode_WSTR_LENGTH(u);
3405 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003406}
3407
Alexander Belopolsky40018472011-02-26 01:02:56 +00003408Py_UNICODE *
3409PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003411 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412}
3413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414
Alexander Belopolsky40018472011-02-26 01:02:56 +00003415Py_ssize_t
3416PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417{
3418 if (!PyUnicode_Check(unicode)) {
3419 PyErr_BadArgument();
3420 goto onError;
3421 }
3422 return PyUnicode_GET_SIZE(unicode);
3423
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return -1;
3426}
3427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003428Py_ssize_t
3429PyUnicode_GetLength(PyObject *unicode)
3430{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003431 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432 PyErr_BadArgument();
3433 return -1;
3434 }
3435
3436 return PyUnicode_GET_LENGTH(unicode);
3437}
3438
3439Py_UCS4
3440PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3441{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003442 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3443 PyErr_BadArgument();
3444 return (Py_UCS4)-1;
3445 }
3446 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3447 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 return (Py_UCS4)-1;
3449 }
3450 return PyUnicode_READ_CHAR(unicode, index);
3451}
3452
3453int
3454PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3455{
3456 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003457 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 return -1;
3459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003460 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3461 PyErr_SetString(PyExc_IndexError, "string index out of range");
3462 return -1;
3463 }
3464 if (_PyUnicode_Dirty(unicode))
3465 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3467 index, ch);
3468 return 0;
3469}
3470
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is static and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3476
Victor Stinner554f3f02010-06-16 23:33:54 +00003477/* create or adjust a UnicodeDecodeError */
3478static void
3479make_decode_exception(PyObject **exceptionObject,
3480 const char *encoding,
3481 const char *input, Py_ssize_t length,
3482 Py_ssize_t startpos, Py_ssize_t endpos,
3483 const char *reason)
3484{
3485 if (*exceptionObject == NULL) {
3486 *exceptionObject = PyUnicodeDecodeError_Create(
3487 encoding, input, length, startpos, endpos, reason);
3488 }
3489 else {
3490 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3491 goto onError;
3492 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3493 goto onError;
3494 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3495 goto onError;
3496 }
3497 return;
3498
3499onError:
3500 Py_DECREF(*exceptionObject);
3501 *exceptionObject = NULL;
3502}
3503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504/* error handling callback helper:
3505 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003506 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 and adjust various state variables.
3508 return 0 on success, -1 on error
3509*/
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511static int
3512unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 const char *encoding, const char *reason,
3514 const char **input, const char **inend, Py_ssize_t *startinpos,
3515 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3516 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003518 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519
3520 PyObject *restuple = NULL;
3521 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003522 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003523 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t requiredsize;
3525 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003527 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 int res = -1;
3530
3531 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003532 *errorHandler = PyCodec_LookupError(errors);
3533 if (*errorHandler == NULL)
3534 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 }
3536
Victor Stinner554f3f02010-06-16 23:33:54 +00003537 make_decode_exception(exceptionObject,
3538 encoding,
3539 *input, *inend - *input,
3540 *startinpos, *endinpos,
3541 reason);
3542 if (*exceptionObject == NULL)
3543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544
3545 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3546 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003547 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003549 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 }
3552 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003554
3555 /* Copy back the bytes variables, which might have been modified by the
3556 callback */
3557 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3558 if (!inputobj)
3559 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003560 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003562 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003563 *input = PyBytes_AS_STRING(inputobj);
3564 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003565 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003566 /* we can DECREF safely, as the exception has another reference,
3567 so the object won't go away. */
3568 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003572 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3574 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576
3577 /* need more space? (at least enough for what we
3578 have+the replacement+the rest of the string (starting
3579 at the new input position), so we won't have to check space
3580 when there are no errors in the rest of the string) */
3581 repptr = PyUnicode_AS_UNICODE(repunicode);
3582 repsize = PyUnicode_GET_SIZE(repunicode);
3583 requiredsize = *outpos + repsize + insize-newpos;
3584 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 if (requiredsize<2*outsize)
3586 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003587 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 goto onError;
3589 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 }
3591 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003592 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 Py_UNICODE_COPY(*outptr, repptr, repsize);
3594 *outptr += repsize;
3595 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 /* we made it! */
3598 res = 0;
3599
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(restuple);
3602 return res;
3603}
3604
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used by UTF-7.  Note that each of
   them may evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other character yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3639
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3673
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify (NUL and non-ASCII are never direct).
 * directO / directWS are truth values; they are parenthesised in the
 * expansion so that compound expressions can be passed safely.  c is
 * evaluated several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003685
Alexander Belopolsky40018472011-02-26 01:02:56 +00003686PyObject *
3687PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003688 Py_ssize_t size,
3689 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003690{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003691 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3692}
3693
Antoine Pitrou244651a2009-05-04 18:56:13 +00003694/* The decoder. The only state we preserve is our read position,
3695 * i.e. how many characters we have consumed. So if we end in the
3696 * middle of a shift sequence we have to back off the read position
3697 * and the output to the beginning of the sequence, otherwise we lose
3698 * all the shift state (seen bits, number of bits seen, high
3699 * surrogate). */
3700
Alexander Belopolsky40018472011-02-26 01:02:56 +00003701PyObject *
3702PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003703 Py_ssize_t size,
3704 const char *errors,
3705 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003708 Py_ssize_t startinpos;
3709 Py_ssize_t endinpos;
3710 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003711 const char *e;
3712 PyUnicodeObject *unicode;
3713 Py_UNICODE *p;
3714 const char *errmsg = "";
3715 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003716 Py_UNICODE *shiftOutStart;
3717 unsigned int base64bits = 0;
3718 unsigned long base64buffer = 0;
3719 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 PyObject *errorHandler = NULL;
3721 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003722
3723 unicode = _PyUnicode_New(size);
3724 if (!unicode)
3725 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003726 if (size == 0) {
3727 if (consumed)
3728 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003729 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003730 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003733 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003734 e = s + size;
3735
3736 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003738 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003739 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003740
Antoine Pitrou244651a2009-05-04 18:56:13 +00003741 if (inShift) { /* in a base-64 section */
3742 if (IS_BASE64(ch)) { /* consume a base-64 character */
3743 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3744 base64bits += 6;
3745 s++;
3746 if (base64bits >= 16) {
3747 /* we have enough bits for a UTF-16 value */
3748 Py_UNICODE outCh = (Py_UNICODE)
3749 (base64buffer >> (base64bits-16));
3750 base64bits -= 16;
3751 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3752 if (surrogate) {
3753 /* expecting a second surrogate */
3754 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3755#ifdef Py_UNICODE_WIDE
3756 *p++ = (((surrogate & 0x3FF)<<10)
3757 | (outCh & 0x3FF)) + 0x10000;
3758#else
3759 *p++ = surrogate;
3760 *p++ = outCh;
3761#endif
3762 surrogate = 0;
3763 }
3764 else {
3765 surrogate = 0;
3766 errmsg = "second surrogate missing";
3767 goto utf7Error;
3768 }
3769 }
3770 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3771 /* first surrogate */
3772 surrogate = outCh;
3773 }
3774 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3775 errmsg = "unexpected second surrogate";
3776 goto utf7Error;
3777 }
3778 else {
3779 *p++ = outCh;
3780 }
3781 }
3782 }
3783 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003784 inShift = 0;
3785 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003786 if (surrogate) {
3787 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003788 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003790 if (base64bits > 0) { /* left-over bits */
3791 if (base64bits >= 6) {
3792 /* We've seen at least one base-64 character */
3793 errmsg = "partial character in shift sequence";
3794 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 else {
3797 /* Some bits remain; they should be zero */
3798 if (base64buffer != 0) {
3799 errmsg = "non-zero padding bits in shift sequence";
3800 goto utf7Error;
3801 }
3802 }
3803 }
3804 if (ch != '-') {
3805 /* '-' is absorbed; other terminating
3806 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807 *p++ = ch;
3808 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 }
3810 }
3811 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003813 s++; /* consume '+' */
3814 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815 s++;
3816 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003817 }
3818 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003820 shiftOutStart = p;
3821 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003822 }
3823 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003824 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825 *p++ = ch;
3826 s++;
3827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003828 else {
3829 startinpos = s-starts;
3830 s++;
3831 errmsg = "unexpected special character";
3832 goto utf7Error;
3833 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003834 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003835utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 outpos = p-PyUnicode_AS_UNICODE(unicode);
3837 endinpos = s-starts;
3838 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 errors, &errorHandler,
3840 "utf7", errmsg,
3841 &starts, &e, &startinpos, &endinpos, &exc, &s,
3842 &unicode, &outpos, &p))
3843 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844 }
3845
Antoine Pitrou244651a2009-05-04 18:56:13 +00003846 /* end of string */
3847
3848 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3849 /* if we're in an inconsistent state, that's an error */
3850 if (surrogate ||
3851 (base64bits >= 6) ||
3852 (base64bits > 0 && base64buffer != 0)) {
3853 outpos = p-PyUnicode_AS_UNICODE(unicode);
3854 endinpos = size;
3855 if (unicode_decode_call_errorhandler(
3856 errors, &errorHandler,
3857 "utf7", "unterminated shift sequence",
3858 &starts, &e, &startinpos, &endinpos, &exc, &s,
3859 &unicode, &outpos, &p))
3860 goto onError;
3861 if (s < e)
3862 goto restart;
3863 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003865
3866 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003867 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (inShift) {
3869 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003870 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 }
3872 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003873 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003874 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003875 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003876
Victor Stinnerfe226c02011-10-03 03:52:20 +02003877 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 goto onError;
3879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 Py_XDECREF(errorHandler);
3881 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003882#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003883 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 Py_DECREF(unicode);
3885 return NULL;
3886 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003887#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003888 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003889 return (PyObject *)unicode;
3890
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 Py_XDECREF(errorHandler);
3893 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894 Py_DECREF(unicode);
3895 return NULL;
3896}
3897
3898
Alexander Belopolsky40018472011-02-26 01:02:56 +00003899PyObject *
3900PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003901 Py_ssize_t size,
3902 int base64SetO,
3903 int base64WhiteSpace,
3904 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003906 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003907 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003908 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003909 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003910 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 unsigned int base64bits = 0;
3912 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003913 char * out;
3914 char * start;
3915
3916 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003918
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003919 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003920 return PyErr_NoMemory();
3921
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003923 if (v == NULL)
3924 return NULL;
3925
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003926 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003927 for (;i < size; ++i) {
3928 Py_UNICODE ch = s[i];
3929
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 if (inShift) {
3931 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3932 /* shifting out */
3933 if (base64bits) { /* output remaining bits */
3934 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3935 base64buffer = 0;
3936 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003937 }
3938 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003939 /* Characters not in the BASE64 set implicitly unshift the sequence
3940 so no '-' is required, except if the character is itself a '-' */
3941 if (IS_BASE64(ch) || ch == '-') {
3942 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003943 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 *out++ = (char) ch;
3945 }
3946 else {
3947 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003948 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003950 else { /* not in a shift sequence */
3951 if (ch == '+') {
3952 *out++ = '+';
3953 *out++ = '-';
3954 }
3955 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3956 *out++ = (char) ch;
3957 }
3958 else {
3959 *out++ = '+';
3960 inShift = 1;
3961 goto encode_char;
3962 }
3963 }
3964 continue;
3965encode_char:
3966#ifdef Py_UNICODE_WIDE
3967 if (ch >= 0x10000) {
3968 /* code first surrogate */
3969 base64bits += 16;
3970 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3971 while (base64bits >= 6) {
3972 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3973 base64bits -= 6;
3974 }
3975 /* prepare second surrogate */
3976 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3977 }
3978#endif
3979 base64bits += 16;
3980 base64buffer = (base64buffer << 16) | ch;
3981 while (base64bits >= 6) {
3982 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3983 base64bits -= 6;
3984 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003985 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 if (base64bits)
3987 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3988 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003989 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003990 if (_PyBytes_Resize(&v, out - start) < 0)
3991 return NULL;
3992 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993}
3994
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995#undef IS_BASE64
3996#undef FROM_BASE64
3997#undef TO_BASE64
3998#undef DECODE_DIRECT
3999#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004000
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001/* --- UTF-8 Codec -------------------------------------------------------- */
4002
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4024
Alexander Belopolsky40018472011-02-26 01:02:56 +00004025PyObject *
4026PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004027 Py_ssize_t size,
4028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029{
Walter Dörwald69652032004-09-07 20:24:22 +00004030 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4031}
4032
Antoine Pitrouab868312009-01-10 15:40:25 +00004033/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4034#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4035
4036/* Mask to quickly check whether a C 'long' contains a
4037 non-ASCII, UTF8-encoded char. */
4038#if (SIZEOF_LONG == 8)
4039# define ASCII_CHAR_MASK 0x8080808080808080L
4040#elif (SIZEOF_LONG == 4)
4041# define ASCII_CHAR_MASK 0x80808080L
4042#else
4043# error C 'long' size should be either 4 or 8!
4044#endif
4045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046/* Scans a UTF-8 string and returns the maximum character to be expected,
4047 the size of the decoded unicode string and if any major errors were
4048 encountered.
4049
4050 This function does check basic UTF-8 sanity, it does however NOT CHECK
4051 if the string contains surrogates, and if all continuation bytes are
4052 within the correct ranges, these checks are performed in
4053 PyUnicode_DecodeUTF8Stateful.
4054
4055 If it sets has_errors to 1, it means the value of unicode_size and max_char
4056 will be bogus and you should not rely on useful information in them.
4057 */
4058static Py_UCS4
4059utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4060 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4061 int *has_errors)
4062{
4063 Py_ssize_t n;
4064 Py_ssize_t char_count = 0;
4065 Py_UCS4 max_char = 127, new_max;
4066 Py_UCS4 upper_bound;
4067 const unsigned char *p = (const unsigned char *)s;
4068 const unsigned char *end = p + string_size;
4069 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4070 int err = 0;
4071
4072 for (; p < end && !err; ++p, ++char_count) {
4073 /* Only check value if it's not a ASCII char... */
4074 if (*p < 0x80) {
4075 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4076 an explanation. */
4077 if (!((size_t) p & LONG_PTR_MASK)) {
4078 /* Help register allocation */
4079 register const unsigned char *_p = p;
4080 while (_p < aligned_end) {
4081 unsigned long value = *(unsigned long *) _p;
4082 if (value & ASCII_CHAR_MASK)
4083 break;
4084 _p += SIZEOF_LONG;
4085 char_count += SIZEOF_LONG;
4086 }
4087 p = _p;
4088 if (p == end)
4089 break;
4090 }
4091 }
4092 if (*p >= 0x80) {
4093 n = utf8_code_length[*p];
4094 new_max = max_char;
4095 switch (n) {
4096 /* invalid start byte */
4097 case 0:
4098 err = 1;
4099 break;
4100 case 2:
4101 /* Code points between 0x00FF and 0x07FF inclusive.
4102 Approximate the upper bound of the code point,
4103 if this flips over 255 we can be sure it will be more
4104 than 255 and the string will need 2 bytes per code coint,
4105 if it stays under or equal to 255, we can be sure 1 byte
4106 is enough.
4107 ((*p & 0b00011111) << 6) | 0b00111111 */
4108 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4109 if (max_char < upper_bound)
4110 new_max = upper_bound;
4111 /* Ensure we track at least that we left ASCII space. */
4112 if (new_max < 128)
4113 new_max = 128;
4114 break;
4115 case 3:
4116 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4117 always > 255 and <= 65535 and will always need 2 bytes. */
4118 if (max_char < 65535)
4119 new_max = 65535;
4120 break;
4121 case 4:
4122 /* Code point will be above 0xFFFF for sure in this case. */
4123 new_max = 65537;
4124 break;
4125 /* Internal error, this should be caught by the first if */
4126 case 1:
4127 default:
4128 assert(0 && "Impossible case in utf8_max_char_and_size");
4129 err = 1;
4130 }
4131 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004132 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 --n;
4134 /* Check if the follow up chars are all valid continuation bytes */
4135 if (n >= 1) {
4136 const unsigned char *cont;
4137 if ((p + n) >= end) {
4138 if (consumed == 0)
4139 /* incomplete data, non-incremental decoding */
4140 err = 1;
4141 break;
4142 }
4143 for (cont = p + 1; cont < (p + n); ++cont) {
4144 if ((*cont & 0xc0) != 0x80) {
4145 err = 1;
4146 break;
4147 }
4148 }
4149 p += n;
4150 }
4151 else
4152 err = 1;
4153 max_char = new_max;
4154 }
4155 }
4156
4157 if (unicode_size)
4158 *unicode_size = char_count;
4159 if (has_errors)
4160 *has_errors = err;
4161 return max_char;
4162}
4163
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, with the element type
   chosen by `kind`: Py_UNICODE for PyUnicode_WCHAR_KIND, unsigned char
   for PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and
   Py_UCS4 for anything else.  `value` is cast to the element type.
   `kind` is evaluated once; `buf`, `index` and `value` are evaluated
   once, in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
4178
Alexander Belopolsky40018472011-02-26 01:02:56 +00004179PyObject *
4180PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 Py_ssize_t size,
4182 const char *errors,
4183 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004187 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004190 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004192 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 PyObject *errorHandler = NULL;
4194 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 Py_UCS4 maxchar = 0;
4196 Py_ssize_t unicode_size;
4197 Py_ssize_t i;
4198 int kind;
4199 void *data;
4200 int has_errors;
4201 Py_UNICODE *error_outptr;
4202#if SIZEOF_WCHAR_T == 2
4203 Py_ssize_t wchar_offset = 0;
4204#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205
Walter Dörwald69652032004-09-07 20:24:22 +00004206 if (size == 0) {
4207 if (consumed)
4208 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4212 consumed, &has_errors);
4213 if (has_errors) {
4214 unicode = _PyUnicode_New(size);
4215 if (!unicode)
4216 return NULL;
4217 kind = PyUnicode_WCHAR_KIND;
4218 data = PyUnicode_AS_UNICODE(unicode);
4219 assert(data != NULL);
4220 }
4221 else {
4222 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4223 if (!unicode)
4224 return NULL;
4225 /* When the string is ASCII only, just use memcpy and return.
4226 unicode_size may be != size if there is an incomplete UTF-8
4227 sequence at the end of the ASCII block. */
4228 if (maxchar < 128 && size == unicode_size) {
4229 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4230 return (PyObject *)unicode;
4231 }
4232 kind = PyUnicode_KIND(unicode);
4233 data = PyUnicode_DATA(unicode);
4234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004238 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239
4240 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004241 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242
4243 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004244 /* Fast path for runs of ASCII characters. Given that common UTF-8
4245 input will consist of an overwhelming majority of ASCII
4246 characters, we try to optimize for this case by checking
4247 as many characters as a C 'long' can contain.
4248 First, check if we can do an aligned read, as most CPUs have
4249 a penalty for unaligned reads.
4250 */
4251 if (!((size_t) s & LONG_PTR_MASK)) {
4252 /* Help register allocation */
4253 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004255 while (_s < aligned_end) {
4256 /* Read a whole long at a time (either 4 or 8 bytes),
4257 and do a fast unrolled copy if it only contains ASCII
4258 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259 unsigned long value = *(unsigned long *) _s;
4260 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004261 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4263 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4264 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4265 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004266#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4268 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4269 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4270 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004271#endif
4272 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004274 }
4275 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004276 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004277 if (s == e)
4278 break;
4279 ch = (unsigned char)*s;
4280 }
4281 }
4282
4283 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 s++;
4286 continue;
4287 }
4288
4289 n = utf8_code_length[ch];
4290
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004291 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 if (consumed)
4293 break;
4294 else {
4295 errmsg = "unexpected end of data";
4296 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004297 endinpos = startinpos+1;
4298 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4299 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 goto utf8Error;
4301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303
4304 switch (n) {
4305
4306 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004307 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 startinpos = s-starts;
4309 endinpos = startinpos+1;
4310 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311
4312 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004313 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 startinpos = s-starts;
4315 endinpos = startinpos+1;
4316 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317
4318 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004319 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004320 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004322 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 goto utf8Error;
4324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004326 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004327 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 break;
4329
4330 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004331 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4332 will result in surrogates in range d800-dfff. Surrogates are
4333 not valid UTF-8 so they are rejected.
4334 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4335 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004336 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004337 (s[2] & 0xc0) != 0x80 ||
4338 ((unsigned char)s[0] == 0xE0 &&
4339 (unsigned char)s[1] < 0xA0) ||
4340 ((unsigned char)s[0] == 0xED &&
4341 (unsigned char)s[1] > 0x9F)) {
4342 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004344 endinpos = startinpos + 1;
4345
4346 /* if s[1] first two bits are 1 and 0, then the invalid
4347 continuation byte is s[2], so increment endinpos by 1,
4348 if not, s[1] is invalid and endinpos doesn't need to
4349 be incremented. */
4350 if ((s[1] & 0xC0) == 0x80)
4351 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 goto utf8Error;
4353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004355 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004356 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004357 break;
4358
4359 case 4:
4360 if ((s[1] & 0xc0) != 0x80 ||
4361 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004362 (s[3] & 0xc0) != 0x80 ||
4363 ((unsigned char)s[0] == 0xF0 &&
4364 (unsigned char)s[1] < 0x90) ||
4365 ((unsigned char)s[0] == 0xF4 &&
4366 (unsigned char)s[1] > 0x8F)) {
4367 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004369 endinpos = startinpos + 1;
4370 if ((s[1] & 0xC0) == 0x80) {
4371 endinpos++;
4372 if ((s[2] & 0xC0) == 0x80)
4373 endinpos++;
4374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 goto utf8Error;
4376 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004377 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004378 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4379 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004381 /* If the string is flexible or we have native UCS-4, write
4382 directly.. */
4383 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4384 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004386 else {
4387 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389 /* translate from 10000..10FFFF to 0..FFFF */
4390 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 /* high surrogate = top 10 bits added to D800 */
4393 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4394 (Py_UNICODE)(0xD800 + (ch >> 10)));
4395
4396 /* low surrogate = bottom 10 bits added to DC00 */
4397 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4398 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4399 }
4400#if SIZEOF_WCHAR_T == 2
4401 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004402#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 }
4405 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004407
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 /* If this is not yet a resizable string, make it one.. */
4410 if (kind != PyUnicode_WCHAR_KIND) {
4411 const Py_UNICODE *u;
4412 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4413 if (!new_unicode)
4414 goto onError;
4415 u = PyUnicode_AsUnicode((PyObject *)unicode);
4416 if (!u)
4417 goto onError;
4418#if SIZEOF_WCHAR_T == 2
4419 i += wchar_offset;
4420#endif
4421 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4422 Py_DECREF(unicode);
4423 unicode = new_unicode;
4424 kind = 0;
4425 data = PyUnicode_AS_UNICODE(new_unicode);
4426 assert(data != NULL);
4427 }
4428 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 if (unicode_decode_call_errorhandler(
4430 errors, &errorHandler,
4431 "utf8", errmsg,
4432 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004435 /* Update data because unicode_decode_call_errorhandler might have
4436 re-created or resized the unicode object. */
4437 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 /* Ensure the unicode_size calculation above was correct: */
4441 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4442
Walter Dörwald69652032004-09-07 20:24:22 +00004443 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 /* Adjust length and ready string when it contained errors and
4447 is of the old resizable kind. */
4448 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004449 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450 goto onError;
4451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_XDECREF(errorHandler);
4454 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004455#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004456 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 Py_DECREF(unicode);
4458 return NULL;
4459 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004460#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004461 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 return (PyObject *)unicode;
4463
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 Py_DECREF(unicode);
4468 return NULL;
4469}
4470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004472
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004473#ifdef __APPLE__
4474
4475/* Simplified UTF-8 decoder using surrogateescape error handler,
4476 used to decode the command line arguments on Mac OS X. */
4477
4478wchar_t*
4479_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4480{
4481 int n;
4482 const char *e;
4483 wchar_t *unicode, *p;
4484
4485 /* Note: size will always be longer than the resulting Unicode
4486 character count */
4487 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4488 PyErr_NoMemory();
4489 return NULL;
4490 }
4491 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4492 if (!unicode)
4493 return NULL;
4494
4495 /* Unpack UTF-8 encoded data */
4496 p = unicode;
4497 e = s + size;
4498 while (s < e) {
4499 Py_UCS4 ch = (unsigned char)*s;
4500
4501 if (ch < 0x80) {
4502 *p++ = (wchar_t)ch;
4503 s++;
4504 continue;
4505 }
4506
4507 n = utf8_code_length[ch];
4508 if (s + n > e) {
4509 goto surrogateescape;
4510 }
4511
4512 switch (n) {
4513 case 0:
4514 case 1:
4515 goto surrogateescape;
4516
4517 case 2:
4518 if ((s[1] & 0xc0) != 0x80)
4519 goto surrogateescape;
4520 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4521 assert ((ch > 0x007F) && (ch <= 0x07FF));
4522 *p++ = (wchar_t)ch;
4523 break;
4524
4525 case 3:
4526 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4527 will result in surrogates in range d800-dfff. Surrogates are
4528 not valid UTF-8 so they are rejected.
4529 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4530 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4531 if ((s[1] & 0xc0) != 0x80 ||
4532 (s[2] & 0xc0) != 0x80 ||
4533 ((unsigned char)s[0] == 0xE0 &&
4534 (unsigned char)s[1] < 0xA0) ||
4535 ((unsigned char)s[0] == 0xED &&
4536 (unsigned char)s[1] > 0x9F)) {
4537
4538 goto surrogateescape;
4539 }
4540 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4541 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004543 break;
4544
4545 case 4:
4546 if ((s[1] & 0xc0) != 0x80 ||
4547 (s[2] & 0xc0) != 0x80 ||
4548 (s[3] & 0xc0) != 0x80 ||
4549 ((unsigned char)s[0] == 0xF0 &&
4550 (unsigned char)s[1] < 0x90) ||
4551 ((unsigned char)s[0] == 0xF4 &&
4552 (unsigned char)s[1] > 0x8F)) {
4553 goto surrogateescape;
4554 }
4555 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4556 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4557 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4558
4559#if SIZEOF_WCHAR_T == 4
4560 *p++ = (wchar_t)ch;
4561#else
4562 /* compute and append the two surrogates: */
4563
4564 /* translate from 10000..10FFFF to 0..FFFF */
4565 ch -= 0x10000;
4566
4567 /* high surrogate = top 10 bits added to D800 */
4568 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4569
4570 /* low surrogate = bottom 10 bits added to DC00 */
4571 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4572#endif
4573 break;
4574 }
4575 s += n;
4576 continue;
4577
4578 surrogateescape:
4579 *p++ = 0xDC00 + ch;
4580 s++;
4581 }
4582 *p = L'\0';
4583 return unicode;
4584}
4585
4586#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004588/* Primary internal function which creates utf8 encoded bytes objects.
4589
4590 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004591 and allocate exactly as much space needed at the end. Else allocate the
4592 maximum possible needed (4 result bytes per Unicode character), and return
4593 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004594*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004595PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004596_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597{
Tim Peters602f7402002-04-27 18:03:26 +00004598#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004599
Guido van Rossum98297ee2007-11-06 21:34:58 +00004600 Py_ssize_t i; /* index into s of next input byte */
4601 PyObject *result; /* result string object */
4602 char *p; /* next free byte in output buffer */
4603 Py_ssize_t nallocated; /* number of result bytes allocated */
4604 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004605 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004606 PyObject *errorHandler = NULL;
4607 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608 int kind;
4609 void *data;
4610 Py_ssize_t size;
4611 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4612#if SIZEOF_WCHAR_T == 2
4613 Py_ssize_t wchar_offset = 0;
4614#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 if (!PyUnicode_Check(unicode)) {
4617 PyErr_BadArgument();
4618 return NULL;
4619 }
4620
4621 if (PyUnicode_READY(unicode) == -1)
4622 return NULL;
4623
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004624 if (PyUnicode_UTF8(unicode))
4625 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4626 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627
4628 kind = PyUnicode_KIND(unicode);
4629 data = PyUnicode_DATA(unicode);
4630 size = PyUnicode_GET_LENGTH(unicode);
4631
Tim Peters602f7402002-04-27 18:03:26 +00004632 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
Tim Peters602f7402002-04-27 18:03:26 +00004634 if (size <= MAX_SHORT_UNICHARS) {
4635 /* Write into the stack buffer; nallocated can't overflow.
4636 * At the end, we'll allocate exactly as much heap space as it
4637 * turns out we need.
4638 */
4639 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004640 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004641 p = stackbuf;
4642 }
4643 else {
4644 /* Overallocate on the heap, and give the excess back at the end. */
4645 nallocated = size * 4;
4646 if (nallocated / 4 != size) /* overflow! */
4647 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004648 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004649 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004650 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004651 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004652 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004653
Tim Peters602f7402002-04-27 18:03:26 +00004654 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004655 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004656
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004657 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004658 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004660
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004662 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004663 *p++ = (char)(0xc0 | (ch >> 6));
4664 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004665 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004666 Py_ssize_t newpos;
4667 PyObject *rep;
4668 Py_ssize_t repsize, k, startpos;
4669 startpos = i-1;
4670#if SIZEOF_WCHAR_T == 2
4671 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004672#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 rep = unicode_encode_call_errorhandler(
4674 errors, &errorHandler, "utf-8", "surrogates not allowed",
4675 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4676 &exc, startpos, startpos+1, &newpos);
4677 if (!rep)
4678 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 if (PyBytes_Check(rep))
4681 repsize = PyBytes_GET_SIZE(rep);
4682 else
4683 repsize = PyUnicode_GET_SIZE(rep);
4684
4685 if (repsize > 4) {
4686 Py_ssize_t offset;
4687
4688 if (result == NULL)
4689 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004690 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004691 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4694 /* integer overflow */
4695 PyErr_NoMemory();
4696 goto error;
4697 }
4698 nallocated += repsize - 4;
4699 if (result != NULL) {
4700 if (_PyBytes_Resize(&result, nallocated) < 0)
4701 goto error;
4702 } else {
4703 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004704 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 goto error;
4706 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4707 }
4708 p = PyBytes_AS_STRING(result) + offset;
4709 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004711 if (PyBytes_Check(rep)) {
4712 char *prep = PyBytes_AS_STRING(rep);
4713 for(k = repsize; k > 0; k--)
4714 *p++ = *prep++;
4715 } else /* rep is unicode */ {
4716 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4717 Py_UNICODE c;
4718
4719 for(k=0; k<repsize; k++) {
4720 c = prep[k];
4721 if (0x80 <= c) {
4722 raise_encode_exception(&exc, "utf-8",
4723 PyUnicode_AS_UNICODE(unicode),
4724 size, i-1, i,
4725 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004726 goto error;
4727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004729 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004732 } else if (ch < 0x10000) {
4733 *p++ = (char)(0xe0 | (ch >> 12));
4734 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4735 *p++ = (char)(0x80 | (ch & 0x3f));
4736 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004737 /* Encode UCS4 Unicode ordinals */
4738 *p++ = (char)(0xf0 | (ch >> 18));
4739 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4740 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4741 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742#if SIZEOF_WCHAR_T == 2
4743 wchar_offset++;
4744#endif
Tim Peters602f7402002-04-27 18:03:26 +00004745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004747
Guido van Rossum98297ee2007-11-06 21:34:58 +00004748 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004749 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004750 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004751 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004752 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004753 }
4754 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004755 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004756 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004757 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004758 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004760
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004761 Py_XDECREF(errorHandler);
4762 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004763 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004764 error:
4765 Py_XDECREF(errorHandler);
4766 Py_XDECREF(exc);
4767 Py_XDECREF(result);
4768 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004769
Tim Peters602f7402002-04-27 18:03:26 +00004770#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771}
4772
Alexander Belopolsky40018472011-02-26 01:02:56 +00004773PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4775 Py_ssize_t size,
4776 const char *errors)
4777{
4778 PyObject *v, *unicode;
4779
4780 unicode = PyUnicode_FromUnicode(s, size);
4781 if (unicode == NULL)
4782 return NULL;
4783 v = _PyUnicode_AsUTF8String(unicode, errors);
4784 Py_DECREF(unicode);
4785 return v;
4786}
4787
4788PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004789PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792}
4793
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794/* --- UTF-32 Codec ------------------------------------------------------- */
4795
4796PyObject *
4797PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors,
4800 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004801{
4802 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4803}
4804
4805PyObject *
4806PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 Py_ssize_t size,
4808 const char *errors,
4809 int *byteorder,
4810 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004811{
4812 const char *starts = s;
4813 Py_ssize_t startinpos;
4814 Py_ssize_t endinpos;
4815 Py_ssize_t outpos;
4816 PyUnicodeObject *unicode;
4817 Py_UNICODE *p;
4818#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004819 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004820 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004821#else
4822 const int pairs = 0;
4823#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004824 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004825 int bo = 0; /* assume native ordering by default */
4826 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004827 /* Offsets from q for retrieving bytes in the right order. */
4828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4829 int iorder[] = {0, 1, 2, 3};
4830#else
4831 int iorder[] = {3, 2, 1, 0};
4832#endif
4833 PyObject *errorHandler = NULL;
4834 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004835
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836 q = (unsigned char *)s;
4837 e = q + size;
4838
4839 if (byteorder)
4840 bo = *byteorder;
4841
4842 /* Check for BOM marks (U+FEFF) in the input and adjust current
4843 byte order setting accordingly. In native mode, the leading BOM
4844 mark is skipped, in all other modes, it is copied to the output
4845 stream as-is (giving a ZWNBSP character). */
4846 if (bo == 0) {
4847 if (size >= 4) {
4848 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 if (bom == 0x0000FEFF) {
4852 q += 4;
4853 bo = -1;
4854 }
4855 else if (bom == 0xFFFE0000) {
4856 q += 4;
4857 bo = 1;
4858 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004859#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 if (bom == 0x0000FEFF) {
4861 q += 4;
4862 bo = 1;
4863 }
4864 else if (bom == 0xFFFE0000) {
4865 q += 4;
4866 bo = -1;
4867 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870 }
4871
4872 if (bo == -1) {
4873 /* force LE */
4874 iorder[0] = 0;
4875 iorder[1] = 1;
4876 iorder[2] = 2;
4877 iorder[3] = 3;
4878 }
4879 else if (bo == 1) {
4880 /* force BE */
4881 iorder[0] = 3;
4882 iorder[1] = 2;
4883 iorder[2] = 1;
4884 iorder[3] = 0;
4885 }
4886
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004887 /* On narrow builds we split characters outside the BMP into two
4888 codepoints => count how much extra space we need. */
4889#ifndef Py_UNICODE_WIDE
4890 for (qq = q; qq < e; qq += 4)
4891 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4892 pairs++;
4893#endif
4894
4895 /* This might be one to much, because of a BOM */
4896 unicode = _PyUnicode_New((size+3)/4+pairs);
4897 if (!unicode)
4898 return NULL;
4899 if (size == 0)
4900 return (PyObject *)unicode;
4901
4902 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004904
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 Py_UCS4 ch;
4907 /* remaining bytes at the end? (size should be divisible by 4) */
4908 if (e-q<4) {
4909 if (consumed)
4910 break;
4911 errmsg = "truncated data";
4912 startinpos = ((const char *)q)-starts;
4913 endinpos = ((const char *)e)-starts;
4914 goto utf32Error;
4915 /* The remaining input chars are ignored if the callback
4916 chooses to skip the input */
4917 }
4918 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4919 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 if (ch >= 0x110000)
4922 {
4923 errmsg = "codepoint not in range(0x110000)";
4924 startinpos = ((const char *)q)-starts;
4925 endinpos = startinpos+4;
4926 goto utf32Error;
4927 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 if (ch >= 0x10000)
4930 {
4931 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4932 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4933 }
4934 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 *p++ = ch;
4937 q += 4;
4938 continue;
4939 utf32Error:
4940 outpos = p-PyUnicode_AS_UNICODE(unicode);
4941 if (unicode_decode_call_errorhandler(
4942 errors, &errorHandler,
4943 "utf32", errmsg,
4944 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4945 &unicode, &outpos, &p))
4946 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 }
4948
4949 if (byteorder)
4950 *byteorder = bo;
4951
4952 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954
4955 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004956 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 goto onError;
4958
4959 Py_XDECREF(errorHandler);
4960 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004961#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004962 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004963 Py_DECREF(unicode);
4964 return NULL;
4965 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004966#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004967 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 return (PyObject *)unicode;
4969
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 Py_DECREF(unicode);
4972 Py_XDECREF(errorHandler);
4973 Py_XDECREF(exc);
4974 return NULL;
4975}
4976
4977PyObject *
4978PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 Py_ssize_t size,
4980 const char *errors,
4981 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004983 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004985 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004987 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988#else
4989 const int pairs = 0;
4990#endif
4991 /* Offsets from p for storing byte pairs in the right order. */
4992#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4993 int iorder[] = {0, 1, 2, 3};
4994#else
4995 int iorder[] = {3, 2, 1, 0};
4996#endif
4997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998#define STORECHAR(CH) \
4999 do { \
5000 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5001 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5002 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5003 p[iorder[0]] = (CH) & 0xff; \
5004 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 } while(0)
5006
5007 /* In narrow builds we can output surrogate pairs as one codepoint,
5008 so we need less space. */
5009#ifndef Py_UNICODE_WIDE
5010 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5012 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5013 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005015 nsize = (size - pairs + (byteorder == 0));
5016 bytesize = nsize * 4;
5017 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005019 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020 if (v == NULL)
5021 return NULL;
5022
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005023 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005027 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028
5029 if (byteorder == -1) {
5030 /* force LE */
5031 iorder[0] = 0;
5032 iorder[1] = 1;
5033 iorder[2] = 2;
5034 iorder[3] = 3;
5035 }
5036 else if (byteorder == 1) {
5037 /* force BE */
5038 iorder[0] = 3;
5039 iorder[1] = 2;
5040 iorder[2] = 1;
5041 iorder[3] = 0;
5042 }
5043
5044 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5048 Py_UCS4 ch2 = *s;
5049 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5050 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5051 s++;
5052 size--;
5053 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005054 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055#endif
5056 STORECHAR(ch);
5057 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005058
5059 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061#undef STORECHAR
5062}
5063
Alexander Belopolsky40018472011-02-26 01:02:56 +00005064PyObject *
5065PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066{
5067 if (!PyUnicode_Check(unicode)) {
5068 PyErr_BadArgument();
5069 return NULL;
5070 }
5071 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 PyUnicode_GET_SIZE(unicode),
5073 NULL,
5074 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075}
5076
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077/* --- UTF-16 Codec ------------------------------------------------------- */
5078
Tim Peters772747b2001-08-09 22:21:55 +00005079PyObject *
5080PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 Py_ssize_t size,
5082 const char *errors,
5083 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084{
Walter Dörwald69652032004-09-07 20:24:22 +00005085 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5086}
5087
Antoine Pitrouab868312009-01-10 15:40:25 +00005088/* Two masks for fast checking of whether a C 'long' may contain
5089 UTF16-encoded surrogate characters. This is an efficient heuristic,
5090 assuming that non-surrogate characters with a code point >= 0x8000 are
5091 rare in most input.
5092 FAST_CHAR_MASK is used when the input is in native byte ordering,
5093 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005094*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005095#if (SIZEOF_LONG == 8)
5096# define FAST_CHAR_MASK 0x8000800080008000L
5097# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5098#elif (SIZEOF_LONG == 4)
5099# define FAST_CHAR_MASK 0x80008000L
5100# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5101#else
5102# error C 'long' size should be either 4 or 8!
5103#endif
5104
Walter Dörwald69652032004-09-07 20:24:22 +00005105PyObject *
5106PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 Py_ssize_t size,
5108 const char *errors,
5109 int *byteorder,
5110 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005111{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005113 Py_ssize_t startinpos;
5114 Py_ssize_t endinpos;
5115 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 PyUnicodeObject *unicode;
5117 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005118 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005119 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005120 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005121 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005122 /* Offsets from q for retrieving byte pairs in the right order. */
5123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5124 int ihi = 1, ilo = 0;
5125#else
5126 int ihi = 0, ilo = 1;
5127#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 PyObject *errorHandler = NULL;
5129 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
5131 /* Note: size will always be longer than the resulting Unicode
5132 character count */
5133 unicode = _PyUnicode_New(size);
5134 if (!unicode)
5135 return NULL;
5136 if (size == 0)
5137 return (PyObject *)unicode;
5138
5139 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005140 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005141 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005142 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143
5144 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005145 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005147 /* Check for BOM marks (U+FEFF) in the input and adjust current
5148 byte order setting accordingly. In native mode, the leading BOM
5149 mark is skipped, in all other modes, it is copied to the output
5150 stream as-is (giving a ZWNBSP character). */
5151 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005152 if (size >= 2) {
5153 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005154#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 if (bom == 0xFEFF) {
5156 q += 2;
5157 bo = -1;
5158 }
5159 else if (bom == 0xFFFE) {
5160 q += 2;
5161 bo = 1;
5162 }
Tim Petersced69f82003-09-16 20:30:58 +00005163#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 if (bom == 0xFEFF) {
5165 q += 2;
5166 bo = 1;
5167 }
5168 else if (bom == 0xFFFE) {
5169 q += 2;
5170 bo = -1;
5171 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005172#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
Tim Peters772747b2001-08-09 22:21:55 +00005176 if (bo == -1) {
5177 /* force LE */
5178 ihi = 1;
5179 ilo = 0;
5180 }
5181 else if (bo == 1) {
5182 /* force BE */
5183 ihi = 0;
5184 ilo = 1;
5185 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005186#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5187 native_ordering = ilo < ihi;
5188#else
5189 native_ordering = ilo > ihi;
5190#endif
Tim Peters772747b2001-08-09 22:21:55 +00005191
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005193 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005195 /* First check for possible aligned read of a C 'long'. Unaligned
5196 reads are more expensive, better to defer to another iteration. */
5197 if (!((size_t) q & LONG_PTR_MASK)) {
5198 /* Fast path for runs of non-surrogate chars. */
5199 register const unsigned char *_q = q;
5200 Py_UNICODE *_p = p;
5201 if (native_ordering) {
5202 /* Native ordering is simple: as long as the input cannot
5203 possibly contain a surrogate char, do an unrolled copy
5204 of several 16-bit code points to the target object.
5205 The non-surrogate check is done on several input bytes
5206 at a time (as many as a C 'long' can contain). */
5207 while (_q < aligned_end) {
5208 unsigned long data = * (unsigned long *) _q;
5209 if (data & FAST_CHAR_MASK)
5210 break;
5211 _p[0] = ((unsigned short *) _q)[0];
5212 _p[1] = ((unsigned short *) _q)[1];
5213#if (SIZEOF_LONG == 8)
5214 _p[2] = ((unsigned short *) _q)[2];
5215 _p[3] = ((unsigned short *) _q)[3];
5216#endif
5217 _q += SIZEOF_LONG;
5218 _p += SIZEOF_LONG / 2;
5219 }
5220 }
5221 else {
5222 /* Byteswapped ordering is similar, but we must decompose
5223 the copy bytewise, and take care of zero'ing out the
5224 upper bytes if the target object is in 32-bit units
5225 (that is, in UCS-4 builds). */
5226 while (_q < aligned_end) {
5227 unsigned long data = * (unsigned long *) _q;
5228 if (data & SWAPPED_FAST_CHAR_MASK)
5229 break;
5230 /* Zero upper bytes in UCS-4 builds */
5231#if (Py_UNICODE_SIZE > 2)
5232 _p[0] = 0;
5233 _p[1] = 0;
5234#if (SIZEOF_LONG == 8)
5235 _p[2] = 0;
5236 _p[3] = 0;
5237#endif
5238#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005239 /* Issue #4916; UCS-4 builds on big endian machines must
5240 fill the two last bytes of each 4-byte unit. */
5241#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5242# define OFF 2
5243#else
5244# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005245#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005246 ((unsigned char *) _p)[OFF + 1] = _q[0];
5247 ((unsigned char *) _p)[OFF + 0] = _q[1];
5248 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5249 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5250#if (SIZEOF_LONG == 8)
5251 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5252 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5253 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5254 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5255#endif
5256#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005257 _q += SIZEOF_LONG;
5258 _p += SIZEOF_LONG / 2;
5259 }
5260 }
5261 p = _p;
5262 q = _q;
5263 if (q >= e)
5264 break;
5265 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005269
5270 if (ch < 0xD800 || ch > 0xDFFF) {
5271 *p++ = ch;
5272 continue;
5273 }
5274
5275 /* UTF-16 code pair: */
5276 if (q > e) {
5277 errmsg = "unexpected end of data";
5278 startinpos = (((const char *)q) - 2) - starts;
5279 endinpos = ((const char *)e) + 1 - starts;
5280 goto utf16Error;
5281 }
5282 if (0xD800 <= ch && ch <= 0xDBFF) {
5283 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5284 q += 2;
5285 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005286#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 *p++ = ch;
5288 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005289#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005291#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 continue;
5293 }
5294 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005295 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 startinpos = (((const char *)q)-4)-starts;
5297 endinpos = startinpos+2;
5298 goto utf16Error;
5299 }
5300
Benjamin Peterson14339b62009-01-31 16:36:08 +00005301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 errmsg = "illegal encoding";
5303 startinpos = (((const char *)q)-2)-starts;
5304 endinpos = startinpos+2;
5305 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005306
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 utf16Error:
5308 outpos = p - PyUnicode_AS_UNICODE(unicode);
5309 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005310 errors,
5311 &errorHandler,
5312 "utf16", errmsg,
5313 &starts,
5314 (const char **)&e,
5315 &startinpos,
5316 &endinpos,
5317 &exc,
5318 (const char **)&q,
5319 &unicode,
5320 &outpos,
5321 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 /* remaining byte at the end? (size should be even) */
5325 if (e == q) {
5326 if (!consumed) {
5327 errmsg = "truncated data";
5328 startinpos = ((const char *)q) - starts;
5329 endinpos = ((const char *)e) + 1 - starts;
5330 outpos = p - PyUnicode_AS_UNICODE(unicode);
5331 if (unicode_decode_call_errorhandler(
5332 errors,
5333 &errorHandler,
5334 "utf16", errmsg,
5335 &starts,
5336 (const char **)&e,
5337 &startinpos,
5338 &endinpos,
5339 &exc,
5340 (const char **)&q,
5341 &unicode,
5342 &outpos,
5343 &p))
5344 goto onError;
5345 /* The remaining input chars are ignored if the callback
5346 chooses to skip the input */
5347 }
5348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
5350 if (byteorder)
5351 *byteorder = bo;
5352
Walter Dörwald69652032004-09-07 20:24:22 +00005353 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005355
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005357 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 goto onError;
5359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 Py_XDECREF(errorHandler);
5361 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005362#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005363 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 Py_DECREF(unicode);
5365 return NULL;
5366 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005367#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005368 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 return (PyObject *)unicode;
5370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 Py_XDECREF(errorHandler);
5374 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 return NULL;
5376}
5377
Antoine Pitrouab868312009-01-10 15:40:25 +00005378#undef FAST_CHAR_MASK
5379#undef SWAPPED_FAST_CHAR_MASK
5380
Tim Peters772747b2001-08-09 22:21:55 +00005381PyObject *
5382PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 Py_ssize_t size,
5384 const char *errors,
5385 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005388 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005389 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005390#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005391 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005392#else
5393 const int pairs = 0;
5394#endif
Tim Peters772747b2001-08-09 22:21:55 +00005395 /* Offsets from p for storing byte pairs in the right order. */
5396#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5397 int ihi = 1, ilo = 0;
5398#else
5399 int ihi = 0, ilo = 1;
5400#endif
5401
Benjamin Peterson29060642009-01-31 22:14:21 +00005402#define STORECHAR(CH) \
5403 do { \
5404 p[ihi] = ((CH) >> 8) & 0xff; \
5405 p[ilo] = (CH) & 0xff; \
5406 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005407 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005409#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005410 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 if (s[i] >= 0x10000)
5412 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005413#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005414 /* 2 * (size + pairs + (byteorder == 0)) */
5415 if (size > PY_SSIZE_T_MAX ||
5416 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005418 nsize = size + pairs + (byteorder == 0);
5419 bytesize = nsize * 2;
5420 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005422 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 if (v == NULL)
5424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005426 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005429 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005430 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005431
5432 if (byteorder == -1) {
5433 /* force LE */
5434 ihi = 1;
5435 ilo = 0;
5436 }
5437 else if (byteorder == 1) {
5438 /* force BE */
5439 ihi = 0;
5440 ilo = 1;
5441 }
5442
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005443 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 Py_UNICODE ch = *s++;
5445 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005446#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 if (ch >= 0x10000) {
5448 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5449 ch = 0xD800 | ((ch-0x10000) >> 10);
5450 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005451#endif
Tim Peters772747b2001-08-09 22:21:55 +00005452 STORECHAR(ch);
5453 if (ch2)
5454 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456
5457 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005458 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005459#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460}
5461
Alexander Belopolsky40018472011-02-26 01:02:56 +00005462PyObject *
5463PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464{
5465 if (!PyUnicode_Check(unicode)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
5469 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 PyUnicode_GET_SIZE(unicode),
5471 NULL,
5472 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473}
5474
5475/* --- Unicode Escape Codec ----------------------------------------------- */
5476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5478 if all the escapes in the string make it still a valid ASCII string.
5479 Returns -1 if any escapes were found which cause the string to
5480 pop out of ASCII range. Otherwise returns the length of the
5481 required buffer to hold the string.
5482 */
5483Py_ssize_t
5484length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5485{
5486 const unsigned char *p = (const unsigned char *)s;
5487 const unsigned char *end = p + size;
5488 Py_ssize_t length = 0;
5489
5490 if (size < 0)
5491 return -1;
5492
5493 for (; p < end; ++p) {
5494 if (*p > 127) {
5495 /* Non-ASCII */
5496 return -1;
5497 }
5498 else if (*p != '\\') {
5499 /* Normal character */
5500 ++length;
5501 }
5502 else {
5503 /* Backslash-escape, check next char */
5504 ++p;
5505 /* Escape sequence reaches till end of string or
5506 non-ASCII follow-up. */
5507 if (p >= end || *p > 127)
5508 return -1;
5509 switch (*p) {
5510 case '\n':
5511 /* backslash + \n result in zero characters */
5512 break;
5513 case '\\': case '\'': case '\"':
5514 case 'b': case 'f': case 't':
5515 case 'n': case 'r': case 'v': case 'a':
5516 ++length;
5517 break;
5518 case '0': case '1': case '2': case '3':
5519 case '4': case '5': case '6': case '7':
5520 case 'x': case 'u': case 'U': case 'N':
5521 /* these do not guarantee ASCII characters */
5522 return -1;
5523 default:
5524 /* count the backslash + the other character */
5525 length += 2;
5526 }
5527 }
5528 }
5529 return length;
5530}
5531
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  -- PyUnicode_WCHAR_KIND selects Py_UNICODE-sized stores; any
            other kind stores a single byte
   buf   -- destination buffer (cast according to kind)
   index -- element index to store at (evaluated once)
   value -- value to store (evaluated once, cast to the element type) */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at buf[index].

   NOTE: this macro reads a variable named `kind` from the scope it is
   expanded in and asserts that it equals PyUnicode_WCHAR_KIND.  The
   expansion is a single parenthesized expression so that it behaves as
   one unit wherever an expression is allowed. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5545
5546
Fredrik Lundh06d12682001-01-24 07:59:11 +00005547static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005551 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t startinpos;
5556 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005561 char* message;
5562 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 PyObject *errorHandler = NULL;
5564 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565 Py_ssize_t ascii_length;
5566 Py_ssize_t i;
5567 int kind;
5568 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 ascii_length = length_of_escaped_ascii_string(s, size);
5571
5572 /* After length_of_escaped_ascii_string() there are two alternatives,
5573 either the string is pure ASCII with named escapes like \n, etc.
5574 and we determined it's exact size (common case)
5575 or it contains \x, \u, ... escape sequences. then we create a
5576 legacy wchar string and resize it at the end of this function. */
5577 if (ascii_length >= 0) {
5578 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5579 if (!v)
5580 goto onError;
5581 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5582 kind = PyUnicode_1BYTE_KIND;
5583 data = PyUnicode_DATA(v);
5584 }
5585 else {
5586 /* Escaped strings will always be longer than the resulting
5587 Unicode string, so we start with size here and then reduce the
5588 length after conversion to the true value.
5589 (but if the error callback returns a long replacement string
5590 we'll have to allocate more space) */
5591 v = _PyUnicode_New(size);
5592 if (!v)
5593 goto onError;
5594 kind = PyUnicode_WCHAR_KIND;
5595 data = PyUnicode_AS_UNICODE(v);
5596 }
5597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 if (size == 0)
5599 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005600 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 while (s < end) {
5604 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005605 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 if (kind == PyUnicode_WCHAR_KIND) {
5609 assert(i < _PyUnicode_WSTR_LENGTH(v));
5610 }
5611 else {
5612 /* The only case in which i == ascii_length is a backslash
5613 followed by a newline. */
5614 assert(i <= ascii_length);
5615 }
5616
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 /* Non-escape characters are interpreted as Unicode ordinals */
5618 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 continue;
5621 }
5622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 /* \ - Escapes */
5625 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005626 c = *s++;
5627 if (s > end)
5628 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629
5630 if (kind == PyUnicode_WCHAR_KIND) {
5631 assert(i < _PyUnicode_WSTR_LENGTH(v));
5632 }
5633 else {
5634 /* The only case in which i == ascii_length is a backslash
5635 followed by a newline. */
5636 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5637 }
5638
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005639 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5644 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5645 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5646 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5647 /* FF */
5648 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5649 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5650 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5651 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5652 /* VT */
5653 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5654 /* BEL, not classic C */
5655 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 case '0': case '1': case '2': case '3':
5659 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005660 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005661 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005662 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005663 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005664 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 break;
5668
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* hex escapes */
5670 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672 digits = 2;
5673 message = "truncated \\xXX escape";
5674 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005678 digits = 4;
5679 message = "truncated \\uXXXX escape";
5680 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005684 digits = 8;
5685 message = "truncated \\UXXXXXXXX escape";
5686 hexescape:
5687 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 if (s+digits>end) {
5690 endinpos = size;
5691 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 errors, &errorHandler,
5693 "unicodeescape", "end of string in escape sequence",
5694 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 goto nextByte;
5699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700 for (j = 0; j < digits; ++j) {
5701 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005702 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 endinpos = (s+j+1)-starts;
5704 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 errors, &errorHandler,
5707 "unicodeescape", message,
5708 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005710 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005713 }
5714 chr = (chr<<4) & ~0xF;
5715 if (c >= '0' && c <= '9')
5716 chr += c - '0';
5717 else if (c >= 'a' && c <= 'f')
5718 chr += 10 + c - 'a';
5719 else
5720 chr += 10 + c - 'A';
5721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005723 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 /* _decoding_error will have already written into the
5725 target buffer. */
5726 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005727 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005728 /* when we get here, chr is a 32-bit unicode character */
5729 if (chr <= 0xffff)
5730 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005731 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005732 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005733 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005734 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005735#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005736 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005737#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005738 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5740 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005741#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005742 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 errors, &errorHandler,
5747 "unicodeescape", "illegal Unicode character",
5748 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005753 break;
5754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005756 case 'N':
5757 message = "malformed \\N character escape";
5758 if (ucnhash_CAPI == NULL) {
5759 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5761 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 if (ucnhash_CAPI == NULL)
5763 goto ucnhashError;
5764 }
5765 if (*s == '{') {
5766 const char *start = s+1;
5767 /* look for the closing brace */
5768 while (*s != '}' && s < end)
5769 s++;
5770 if (s > start && s < end && *s == '}') {
5771 /* found a name. look it up in the unicode database */
5772 message = "unknown Unicode character name";
5773 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5775 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005776 goto store;
5777 }
5778 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 errors, &errorHandler,
5783 "unicodeescape", message,
5784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005786 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788 break;
5789
5790 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005791 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 message = "\\ at end of string";
5794 s--;
5795 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 errors, &errorHandler,
5799 "unicodeescape", message,
5800 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005802 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005803 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005804 }
5805 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5807 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005808 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814 /* Ensure the length prediction worked in case of ASCII strings */
5815 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5816
Victor Stinnerfe226c02011-10-03 03:52:20 +02005817 if (kind == PyUnicode_WCHAR_KIND)
5818 {
5819 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5820 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005821 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005822 Py_XDECREF(errorHandler);
5823 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005824#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005825 if (_PyUnicode_READY_REPLACE(&v)) {
5826 Py_DECREF(v);
5827 return NULL;
5828 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005829#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005830 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005834 PyErr_SetString(
5835 PyExc_UnicodeError,
5836 "\\N escapes not supported (can't load unicodedata module)"
5837 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005841 return NULL;
5842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848}
5849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850#undef WRITE_ASCII_OR_WSTR
5851#undef WRITE_WSTR
5852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853/* Return a Unicode-Escape string version of the Unicode object.
5854
5855 If quotes is true, the string is enclosed in u"" or u'' quotes as
5856 appropriate.
5857
5858*/
5859
Walter Dörwald79e913e2007-05-12 11:08:06 +00005860static const char *hexdigits = "0123456789abcdef";
5861
Alexander Belopolsky40018472011-02-26 01:02:56 +00005862PyObject *
5863PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005864 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005866 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005869#ifdef Py_UNICODE_WIDE
5870 const Py_ssize_t expandsize = 10;
5871#else
5872 const Py_ssize_t expandsize = 6;
5873#endif
5874
Thomas Wouters89f507f2006-12-13 04:49:30 +00005875 /* XXX(nnorwitz): rather than over-allocating, it would be
5876 better to choose a different scheme. Perhaps scan the
5877 first N-chars of the string and allocate based on that size.
5878 */
5879 /* Initial allocation is based on the longest-possible unichr
5880 escape.
5881
5882 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5883 unichr, so in this case it's the longest unichr escape. In
5884 narrow (UTF-16) builds this is five chars per source unichr
5885 since there are two unichrs in the surrogate pair, so in narrow
5886 (UTF-16) builds it's not the longest unichr escape.
5887
5888 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5889 so in the narrow (UTF-16) build case it's the longest unichr
5890 escape.
5891 */
5892
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 if (size == 0)
5894 return PyBytes_FromStringAndSize(NULL, 0);
5895
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005896 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005898
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005899 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 2
5901 + expandsize*size
5902 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (repr == NULL)
5904 return NULL;
5905
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005906 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 while (size-- > 0) {
5909 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005910
Walter Dörwald79e913e2007-05-12 11:08:06 +00005911 /* Escape backslashes */
5912 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 *p++ = '\\';
5914 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005915 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005916 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005917
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005918#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005919 /* Map 21-bit characters to '\U00xxxxxx' */
5920 else if (ch >= 0x10000) {
5921 *p++ = '\\';
5922 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005923 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5924 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5925 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5926 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5927 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5928 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5929 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5930 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005932 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005933#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5935 else if (ch >= 0xD800 && ch < 0xDC00) {
5936 Py_UNICODE ch2;
5937 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 ch2 = *s++;
5940 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005941 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5943 *p++ = '\\';
5944 *p++ = 'U';
5945 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5946 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5947 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5948 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5949 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5950 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5951 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5952 *p++ = hexdigits[ucs & 0x0000000F];
5953 continue;
5954 }
5955 /* Fall through: isolated surrogates are copied as-is */
5956 s--;
5957 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005958 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005959#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005960
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005962 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
5964 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005965 *p++ = hexdigits[(ch >> 12) & 0x000F];
5966 *p++ = hexdigits[(ch >> 8) & 0x000F];
5967 *p++ = hexdigits[(ch >> 4) & 0x000F];
5968 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005971 /* Map special whitespace to '\t', \n', '\r' */
5972 else if (ch == '\t') {
5973 *p++ = '\\';
5974 *p++ = 't';
5975 }
5976 else if (ch == '\n') {
5977 *p++ = '\\';
5978 *p++ = 'n';
5979 }
5980 else if (ch == '\r') {
5981 *p++ = '\\';
5982 *p++ = 'r';
5983 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005984
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005985 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005986 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005988 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005989 *p++ = hexdigits[(ch >> 4) & 0x000F];
5990 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005991 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 /* Copy everything else as-is */
5994 else
5995 *p++ = (char) ch;
5996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005998 assert(p - PyBytes_AS_STRING(repr) > 0);
5999 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6000 return NULL;
6001 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
6005PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006007 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 if (!PyUnicode_Check(unicode)) {
6009 PyErr_BadArgument();
6010 return NULL;
6011 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006012 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6013 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006014 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015}
6016
6017/* --- Raw Unicode Escape Codec ------------------------------------------- */
6018
Alexander Belopolsky40018472011-02-26 01:02:56 +00006019PyObject *
6020PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006021 Py_ssize_t size,
6022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 Py_ssize_t startinpos;
6026 Py_ssize_t endinpos;
6027 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 const char *end;
6031 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 PyObject *errorHandler = NULL;
6033 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006034
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 /* Escaped strings will always be longer than the resulting
6036 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 length after conversion to the true value. (But decoding error
6038 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 v = _PyUnicode_New(size);
6040 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 end = s + size;
6046 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 unsigned char c;
6048 Py_UCS4 x;
6049 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006050 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* Non-escape characters are interpreted as Unicode ordinals */
6053 if (*s != '\\') {
6054 *p++ = (unsigned char)*s++;
6055 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 startinpos = s-starts;
6058
6059 /* \u-escapes are only interpreted iff the number of leading
6060 backslashes if odd */
6061 bs = s;
6062 for (;s < end;) {
6063 if (*s != '\\')
6064 break;
6065 *p++ = (unsigned char)*s++;
6066 }
6067 if (((s - bs) & 1) == 0 ||
6068 s >= end ||
6069 (*s != 'u' && *s != 'U')) {
6070 continue;
6071 }
6072 p--;
6073 count = *s=='u' ? 4 : 8;
6074 s++;
6075
6076 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6077 outpos = p-PyUnicode_AS_UNICODE(v);
6078 for (x = 0, i = 0; i < count; ++i, ++s) {
6079 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006080 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 endinpos = s-starts;
6082 if (unicode_decode_call_errorhandler(
6083 errors, &errorHandler,
6084 "rawunicodeescape", "truncated \\uXXXX",
6085 &starts, &end, &startinpos, &endinpos, &exc, &s,
6086 &v, &outpos, &p))
6087 goto onError;
6088 goto nextByte;
6089 }
6090 x = (x<<4) & ~0xF;
6091 if (c >= '0' && c <= '9')
6092 x += c - '0';
6093 else if (c >= 'a' && c <= 'f')
6094 x += 10 + c - 'a';
6095 else
6096 x += 10 + c - 'A';
6097 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006098 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 /* UCS-2 character */
6100 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006101 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* UCS-4 character. Either store directly, or as
6103 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006104#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006106#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 x -= 0x10000L;
6108 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6109 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006110#endif
6111 } else {
6112 endinpos = s-starts;
6113 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006114 if (unicode_decode_call_errorhandler(
6115 errors, &errorHandler,
6116 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 &starts, &end, &startinpos, &endinpos, &exc, &s,
6118 &v, &outpos, &p))
6119 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 nextByte:
6122 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006124 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006128#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006129 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130 Py_DECREF(v);
6131 return NULL;
6132 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006133#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006134 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006136
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 Py_XDECREF(errorHandler);
6140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return NULL;
6142}
6143
Alexander Belopolsky40018472011-02-26 01:02:56 +00006144PyObject *
6145PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006146 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006148 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 char *p;
6150 char *q;
6151
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006152#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006153 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006154#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006155 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006156#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006157
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006158 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 if (repr == NULL)
6163 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006164 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006167 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 while (size-- > 0) {
6169 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006170#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* Map 32-bit characters to '\Uxxxxxxxx' */
6172 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006173 *p++ = '\\';
6174 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006175 *p++ = hexdigits[(ch >> 28) & 0xf];
6176 *p++ = hexdigits[(ch >> 24) & 0xf];
6177 *p++ = hexdigits[(ch >> 20) & 0xf];
6178 *p++ = hexdigits[(ch >> 16) & 0xf];
6179 *p++ = hexdigits[(ch >> 12) & 0xf];
6180 *p++ = hexdigits[(ch >> 8) & 0xf];
6181 *p++ = hexdigits[(ch >> 4) & 0xf];
6182 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006183 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006184 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006185#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6187 if (ch >= 0xD800 && ch < 0xDC00) {
6188 Py_UNICODE ch2;
6189 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 ch2 = *s++;
6192 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006193 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6195 *p++ = '\\';
6196 *p++ = 'U';
6197 *p++ = hexdigits[(ucs >> 28) & 0xf];
6198 *p++ = hexdigits[(ucs >> 24) & 0xf];
6199 *p++ = hexdigits[(ucs >> 20) & 0xf];
6200 *p++ = hexdigits[(ucs >> 16) & 0xf];
6201 *p++ = hexdigits[(ucs >> 12) & 0xf];
6202 *p++ = hexdigits[(ucs >> 8) & 0xf];
6203 *p++ = hexdigits[(ucs >> 4) & 0xf];
6204 *p++ = hexdigits[ucs & 0xf];
6205 continue;
6206 }
6207 /* Fall through: isolated surrogates are copied as-is */
6208 s--;
6209 size++;
6210 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Map 16-bit characters to '\uxxxx' */
6213 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006216 *p++ = hexdigits[(ch >> 12) & 0xf];
6217 *p++ = hexdigits[(ch >> 8) & 0xf];
6218 *p++ = hexdigits[(ch >> 4) & 0xf];
6219 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* Copy everything else as-is */
6222 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = (char) ch;
6224 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006225 size = p - q;
6226
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 assert(size > 0);
6228 if (_PyBytes_Resize(&repr, size) < 0)
6229 return NULL;
6230 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231}
6232
Alexander Belopolsky40018472011-02-26 01:02:56 +00006233PyObject *
6234PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006236 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006238 PyErr_BadArgument();
6239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006241 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6242 PyUnicode_GET_SIZE(unicode));
6243
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006244 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245}
6246
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247/* --- Unicode Internal Codec ------------------------------------------- */
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 Py_ssize_t size,
6252 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253{
6254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t startinpos;
6256 Py_ssize_t endinpos;
6257 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006258 PyUnicodeObject *v;
6259 Py_UNICODE *p;
6260 const char *end;
6261 const char *reason;
6262 PyObject *errorHandler = NULL;
6263 PyObject *exc = NULL;
6264
Neal Norwitzd43069c2006-01-08 01:12:10 +00006265#ifdef Py_UNICODE_WIDE
6266 Py_UNICODE unimax = PyUnicode_GetMax();
6267#endif
6268
Thomas Wouters89f507f2006-12-13 04:49:30 +00006269 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006270 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6271 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006273 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6274 as string was created with the old API. */
6275 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 p = PyUnicode_AS_UNICODE(v);
6278 end = s + size;
6279
6280 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006281 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006282 /* We have to sanity check the raw data, otherwise doom looms for
6283 some malformed UCS-4 data. */
6284 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006287#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 end-s < Py_UNICODE_SIZE
6289 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 startinpos = s - starts;
6292 if (end-s < Py_UNICODE_SIZE) {
6293 endinpos = end-starts;
6294 reason = "truncated input";
6295 }
6296 else {
6297 endinpos = s - starts + Py_UNICODE_SIZE;
6298 reason = "illegal code point (> 0x10FFFF)";
6299 }
6300 outpos = p - PyUnicode_AS_UNICODE(v);
6301 if (unicode_decode_call_errorhandler(
6302 errors, &errorHandler,
6303 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006304 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006305 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006306 goto onError;
6307 }
6308 }
6309 else {
6310 p++;
6311 s += Py_UNICODE_SIZE;
6312 }
6313 }
6314
Victor Stinnerfe226c02011-10-03 03:52:20 +02006315 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 goto onError;
6317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006319#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006320 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006321 Py_DECREF(v);
6322 return NULL;
6323 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006324#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006325 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326 return (PyObject *)v;
6327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(v);
6330 Py_XDECREF(errorHandler);
6331 Py_XDECREF(exc);
6332 return NULL;
6333}
6334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335/* --- Latin-1 Codec ------------------------------------------------------ */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
6338PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006339 Py_ssize_t size,
6340 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006343 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344}
6345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006347static void
6348make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006349 const char *encoding,
6350 const Py_UNICODE *unicode, Py_ssize_t size,
6351 Py_ssize_t startpos, Py_ssize_t endpos,
6352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 *exceptionObject = PyUnicodeEncodeError_Create(
6356 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
6358 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
6367 Py_DECREF(*exceptionObject);
6368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
6370}
6371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373static void
6374raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006375 const char *encoding,
6376 const Py_UNICODE *unicode, Py_ssize_t size,
6377 Py_ssize_t startpos, Py_ssize_t endpos,
6378 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379{
6380 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384}
6385
6386/* error handling callback helper:
6387 build arguments, call the callback and check the arguments,
6388 put the result into newpos and return the replacement string, which
6389 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390static PyObject *
6391unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 PyObject **errorHandler,
6393 const char *encoding, const char *reason,
6394 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6395 Py_ssize_t startpos, Py_ssize_t endpos,
6396 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006398 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399
6400 PyObject *restuple;
6401 PyObject *resunicode;
6402
6403 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 }
6408
6409 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413
6414 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006419 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 Py_DECREF(restuple);
6421 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006423 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 &resunicode, newpos)) {
6425 Py_DECREF(restuple);
6426 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006428 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6429 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6430 Py_DECREF(restuple);
6431 return NULL;
6432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006435 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6437 Py_DECREF(restuple);
6438 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 Py_INCREF(resunicode);
6441 Py_DECREF(restuple);
6442 return resunicode;
6443}
6444
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445static PyObject *
6446unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006447 Py_ssize_t size,
6448 const char *errors,
6449 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450{
6451 /* output object */
6452 PyObject *res;
6453 /* pointers to the beginning and end+1 of input */
6454 const Py_UNICODE *startp = p;
6455 const Py_UNICODE *endp = p + size;
6456 /* pointer to the beginning of the unencodable characters */
6457 /* const Py_UNICODE *badp = NULL; */
6458 /* pointer into the output */
6459 char *str;
6460 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006462 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6463 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 PyObject *errorHandler = NULL;
6465 PyObject *exc = NULL;
6466 /* the following variable is used for caching string comparisons
6467 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6468 int known_errorHandler = -1;
6469
6470 /* allocate enough for a simple encoding without
6471 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006472 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006473 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006474 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006477 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 ressize = size;
6479
6480 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* can we encode this? */
6484 if (c<limit) {
6485 /* no overflow check, because we know that the space is enough */
6486 *str++ = (char)c;
6487 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 else {
6490 Py_ssize_t unicodepos = p-startp;
6491 Py_ssize_t requiredsize;
6492 PyObject *repunicode;
6493 Py_ssize_t repsize;
6494 Py_ssize_t newpos;
6495 Py_ssize_t respos;
6496 Py_UNICODE *uni2;
6497 /* startpos for collecting unencodable chars */
6498 const Py_UNICODE *collstart = p;
6499 const Py_UNICODE *collend = p;
6500 /* find all unecodable characters */
6501 while ((collend < endp) && ((*collend)>=limit))
6502 ++collend;
6503 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6504 if (known_errorHandler==-1) {
6505 if ((errors==NULL) || (!strcmp(errors, "strict")))
6506 known_errorHandler = 1;
6507 else if (!strcmp(errors, "replace"))
6508 known_errorHandler = 2;
6509 else if (!strcmp(errors, "ignore"))
6510 known_errorHandler = 3;
6511 else if (!strcmp(errors, "xmlcharrefreplace"))
6512 known_errorHandler = 4;
6513 else
6514 known_errorHandler = 0;
6515 }
6516 switch (known_errorHandler) {
6517 case 1: /* strict */
6518 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6519 goto onError;
6520 case 2: /* replace */
6521 while (collstart++<collend)
6522 *str++ = '?'; /* fall through */
6523 case 3: /* ignore */
6524 p = collend;
6525 break;
6526 case 4: /* xmlcharrefreplace */
6527 respos = str - PyBytes_AS_STRING(res);
6528 /* determine replacement size (temporarily (mis)uses p) */
6529 for (p = collstart, repsize = 0; p < collend; ++p) {
6530 if (*p<10)
6531 repsize += 2+1+1;
6532 else if (*p<100)
6533 repsize += 2+2+1;
6534 else if (*p<1000)
6535 repsize += 2+3+1;
6536 else if (*p<10000)
6537 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006538#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 else
6540 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006541#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 else if (*p<100000)
6543 repsize += 2+5+1;
6544 else if (*p<1000000)
6545 repsize += 2+6+1;
6546 else
6547 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006548#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
6550 requiredsize = respos+repsize+(endp-collend);
6551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
6559 /* generate replacement (temporarily (mis)uses p) */
6560 for (p = collstart; p < collend; ++p) {
6561 str += sprintf(str, "&#%d;", (int)*p);
6562 }
6563 p = collend;
6564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6567 encoding, reason, startp, size, &exc,
6568 collstart-startp, collend-startp, &newpos);
6569 if (repunicode == NULL)
6570 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006571 if (PyBytes_Check(repunicode)) {
6572 /* Directly copy bytes result to output. */
6573 repsize = PyBytes_Size(repunicode);
6574 if (repsize > 1) {
6575 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006576 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006577 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6578 Py_DECREF(repunicode);
6579 goto onError;
6580 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006581 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 ressize += repsize-1;
6583 }
6584 memcpy(str, PyBytes_AsString(repunicode), repsize);
6585 str += repsize;
6586 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006587 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 /* need more space? (at least enough for what we
6591 have+the replacement+the rest of the string, so
6592 we won't have to check space for encodable characters) */
6593 respos = str - PyBytes_AS_STRING(res);
6594 repsize = PyUnicode_GET_SIZE(repunicode);
6595 requiredsize = respos+repsize+(endp-collend);
6596 if (requiredsize > ressize) {
6597 if (requiredsize<2*ressize)
6598 requiredsize = 2*ressize;
6599 if (_PyBytes_Resize(&res, requiredsize)) {
6600 Py_DECREF(repunicode);
6601 goto onError;
6602 }
6603 str = PyBytes_AS_STRING(res) + respos;
6604 ressize = requiredsize;
6605 }
6606 /* check if there is anything unencodable in the replacement
6607 and copy it to the output */
6608 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6609 c = *uni2;
6610 if (c >= limit) {
6611 raise_encode_exception(&exc, encoding, startp, size,
6612 unicodepos, unicodepos+1, reason);
6613 Py_DECREF(repunicode);
6614 goto onError;
6615 }
6616 *str = (char)c;
6617 }
6618 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
6622 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006623 /* Resize if we allocated to much */
6624 size = str - PyBytes_AS_STRING(res);
6625 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006626 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006627 if (_PyBytes_Resize(&res, size) < 0)
6628 goto onError;
6629 }
6630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 Py_XDECREF(errorHandler);
6632 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006633 return res;
6634
6635 onError:
6636 Py_XDECREF(res);
6637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
6639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640}
6641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
6643PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 Py_ssize_t size,
6645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006651_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
6653 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 PyErr_BadArgument();
6655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006657 if (PyUnicode_READY(unicode) == -1)
6658 return NULL;
6659 /* Fast path: if it is a one-byte string, construct
6660 bytes object directly. */
6661 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6662 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6663 PyUnicode_GET_LENGTH(unicode));
6664 /* Non-Latin-1 characters present. Defer to above function to
6665 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 errors);
6669}
6670
6671PyObject*
6672PyUnicode_AsLatin1String(PyObject *unicode)
6673{
6674 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
6677/* --- 7-bit ASCII Codec -------------------------------------------------- */
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_DecodeASCII(const char *s,
6681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006686 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687 Py_ssize_t startinpos;
6688 Py_ssize_t endinpos;
6689 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006691 int has_error;
6692 const unsigned char *p = (const unsigned char *)s;
6693 const unsigned char *end = p + size;
6694 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 PyObject *errorHandler = NULL;
6696 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006699 if (size == 1 && (unsigned char)s[0] < 128)
6700 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006701
Victor Stinner702c7342011-10-05 13:50:52 +02006702 has_error = 0;
6703 while (p < end && !has_error) {
6704 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6705 an explanation. */
6706 if (!((size_t) p & LONG_PTR_MASK)) {
6707 /* Help register allocation */
6708 register const unsigned char *_p = p;
6709 while (_p < aligned_end) {
6710 unsigned long value = *(unsigned long *) _p;
6711 if (value & ASCII_CHAR_MASK) {
6712 has_error = 1;
6713 break;
6714 }
6715 _p += SIZEOF_LONG;
6716 }
6717 if (_p == end)
6718 break;
6719 if (has_error)
6720 break;
6721 p = _p;
6722 }
6723 if (*p & 0x80) {
6724 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006725 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006726 }
6727 else {
6728 ++p;
6729 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006730 }
Victor Stinner702c7342011-10-05 13:50:52 +02006731 if (!has_error)
6732 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 v = _PyUnicode_New(size);
6735 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006739 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 e = s + size;
6741 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 register unsigned char c = (unsigned char)*s;
6743 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006744 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 ++s;
6746 }
6747 else {
6748 startinpos = s-starts;
6749 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006750 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 if (unicode_decode_call_errorhandler(
6752 errors, &errorHandler,
6753 "ascii", "ordinal not in range(128)",
6754 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006755 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 goto onError;
6757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 }
Victor Stinner702c7342011-10-05 13:50:52 +02006759 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6760 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 Py_XDECREF(errorHandler);
6763 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006764#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006765 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766 Py_DECREF(v);
6767 return NULL;
6768 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006769#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006770 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006772
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006775 Py_XDECREF(errorHandler);
6776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 return NULL;
6778}
6779
Alexander Belopolsky40018472011-02-26 01:02:56 +00006780PyObject *
6781PyUnicode_EncodeASCII(const Py_UNICODE *p,
6782 Py_ssize_t size,
6783 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006785 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786}
6787
Alexander Belopolsky40018472011-02-26 01:02:56 +00006788PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006789_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 PyErr_BadArgument();
6793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006795 if (PyUnicode_READY(unicode) == -1)
6796 return NULL;
6797 /* Fast path: if it is an ASCII-only string, construct bytes object
6798 directly. Else defer to above function to raise the exception. */
6799 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6800 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6801 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006804 errors);
6805}
6806
6807PyObject *
6808PyUnicode_AsASCIIString(PyObject *unicode)
6809{
6810 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Victor Stinner99b95382011-07-04 14:23:54 +02006813#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006815/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006816
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006817#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818#define NEED_RETRY
6819#endif
6820
6821/* XXX This code is limited to "true" double-byte encodings, as
6822 a) it assumes an incomplete character consists of a single byte, and
6823 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   The byte itself must satisfy IsDBCSLeadByte().  In addition, with `prev`
   being what CharPrev() reports as the start of the preceding character,
   one of the following must hold:
     - prev == curr,
     - the byte at prev is not itself a lead byte, or
     - prev is exactly two bytes before curr.

   Returns 1 if so, 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6837
6838/*
6839 * Decode MBCS string into unicode object. If 'final' is set, converts
6840 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6841 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006842static int
6843decode_mbcs(PyUnicodeObject **v,
6844 const char *s, /* MBCS string */
6845 int size, /* sizeof MBCS string */
6846 int final,
6847 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848{
6849 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006850 Py_ssize_t n;
6851 DWORD usize;
6852 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 assert(size >= 0);
6855
Victor Stinner554f3f02010-06-16 23:33:54 +00006856 /* check and handle 'errors' arg */
6857 if (errors==NULL || strcmp(errors, "strict")==0)
6858 flags = MB_ERR_INVALID_CHARS;
6859 else if (strcmp(errors, "ignore")==0)
6860 flags = 0;
6861 else {
6862 PyErr_Format(PyExc_ValueError,
6863 "mbcs encoding does not support errors='%s'",
6864 errors);
6865 return -1;
6866 }
6867
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 /* Skip trailing lead-byte unless 'final' is set */
6869 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
6872 /* First get the size of the result */
6873 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006874 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6875 if (usize==0)
6876 goto mbcs_decode_error;
6877 } else
6878 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
6880 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* Create unicode object */
6882 *v = _PyUnicode_New(usize);
6883 if (*v == NULL)
6884 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886 }
6887 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* Extend unicode object */
6889 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006890 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 }
6893
6894 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006895 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6898 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006902
6903mbcs_decode_error:
6904 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6905 we raise a UnicodeDecodeError - else it is a 'generic'
6906 windows error
6907 */
6908 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6909 /* Ideally, we should get reason from FormatMessage - this
6910 is the Windows 2000 English version of the message
6911 */
6912 PyObject *exc = NULL;
6913 const char *reason = "No mapping for the Unicode character exists "
6914 "in the target multi-byte code page.";
6915 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6916 if (exc != NULL) {
6917 PyCodec_StrictErrors(exc);
6918 Py_DECREF(exc);
6919 }
6920 } else {
6921 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6922 }
6923 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924}
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_DecodeMBCSStateful(const char *s,
6928 Py_ssize_t size,
6929 const char *errors,
6930 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931{
6932 PyUnicodeObject *v = NULL;
6933 int done;
6934
6935 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937
6938#ifdef NEED_RETRY
6939 retry:
6940 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006941 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942 else
6943#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006944 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945
6946 if (done < 0) {
6947 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 }
6950
6951 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953
6954#ifdef NEED_RETRY
6955 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 s += done;
6957 size -= done;
6958 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006959 }
6960#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006961#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006962 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 Py_DECREF(v);
6964 return NULL;
6965 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006966#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006967 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 return (PyObject *)v;
6969}
6970
Alexander Belopolsky40018472011-02-26 01:02:56 +00006971PyObject *
6972PyUnicode_DecodeMBCS(const char *s,
6973 Py_ssize_t size,
6974 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006975{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6977}
6978
6979/*
6980 * Convert unicode into string object (MBCS).
6981 * Returns 0 if succeed, -1 otherwise.
6982 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006983static int
6984encode_mbcs(PyObject **repr,
6985 const Py_UNICODE *p, /* unicode */
6986 int size, /* size of unicode */
6987 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006988{
Victor Stinner554f3f02010-06-16 23:33:54 +00006989 BOOL usedDefaultChar = FALSE;
6990 BOOL *pusedDefaultChar;
6991 int mbcssize;
6992 Py_ssize_t n;
6993 PyObject *exc = NULL;
6994 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006995
6996 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006997
Victor Stinner554f3f02010-06-16 23:33:54 +00006998 /* check and handle 'errors' arg */
6999 if (errors==NULL || strcmp(errors, "strict")==0) {
7000 flags = WC_NO_BEST_FIT_CHARS;
7001 pusedDefaultChar = &usedDefaultChar;
7002 } else if (strcmp(errors, "replace")==0) {
7003 flags = 0;
7004 pusedDefaultChar = NULL;
7005 } else {
7006 PyErr_Format(PyExc_ValueError,
7007 "mbcs encoding does not support errors='%s'",
7008 errors);
7009 return -1;
7010 }
7011
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007012 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007014 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7015 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (mbcssize == 0) {
7017 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7018 return -1;
7019 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007020 /* If we used a default char, then we failed! */
7021 if (pusedDefaultChar && *pusedDefaultChar)
7022 goto mbcs_encode_error;
7023 } else {
7024 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007025 }
7026
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 /* Create string object */
7029 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7030 if (*repr == NULL)
7031 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007032 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033 }
7034 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 /* Extend string object */
7036 n = PyBytes_Size(*repr);
7037 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7038 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039 }
7040
7041 /* Do the conversion */
7042 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7045 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7047 return -1;
7048 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007049 if (pusedDefaultChar && *pusedDefaultChar)
7050 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007053
7054mbcs_encode_error:
7055 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7056 Py_XDECREF(exc);
7057 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007058}
7059
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060PyObject *
7061PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7062 Py_ssize_t size,
7063 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065 PyObject *repr = NULL;
7066 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007067
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007071 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072 else
7073#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007074 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007075
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 Py_XDECREF(repr);
7078 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007079 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
7081#ifdef NEED_RETRY
7082 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 p += INT_MAX;
7084 size -= INT_MAX;
7085 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 }
7087#endif
7088
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007089 return repr;
7090}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007091
Alexander Belopolsky40018472011-02-26 01:02:56 +00007092PyObject *
7093PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007094{
7095 if (!PyUnicode_Check(unicode)) {
7096 PyErr_BadArgument();
7097 return NULL;
7098 }
7099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 PyUnicode_GET_SIZE(unicode),
7101 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007102}
7103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104#undef NEED_RETRY
7105
Victor Stinner99b95382011-07-04 14:23:54 +02007106#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007107
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108/* --- Character Mapping Codec -------------------------------------------- */
7109
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110PyObject *
7111PyUnicode_DecodeCharmap(const char *s,
7112 Py_ssize_t size,
7113 PyObject *mapping,
7114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 Py_ssize_t startinpos;
7118 Py_ssize_t endinpos;
7119 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 PyUnicodeObject *v;
7122 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007123 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124 PyObject *errorHandler = NULL;
7125 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007126 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007127 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007128
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 /* Default to Latin-1 */
7130 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133 v = _PyUnicode_New(size);
7134 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007140 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 mapstring = PyUnicode_AS_UNICODE(mapping);
7142 maplen = PyUnicode_GET_SIZE(mapping);
7143 while (s < e) {
7144 unsigned char ch = *s;
7145 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 if (ch < maplen)
7148 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 if (x == 0xfffe) {
7151 /* undefined mapping */
7152 outpos = p-PyUnicode_AS_UNICODE(v);
7153 startinpos = s-starts;
7154 endinpos = startinpos+1;
7155 if (unicode_decode_call_errorhandler(
7156 errors, &errorHandler,
7157 "charmap", "character maps to <undefined>",
7158 &starts, &e, &startinpos, &endinpos, &exc, &s,
7159 &v, &outpos, &p)) {
7160 goto onError;
7161 }
7162 continue;
7163 }
7164 *p++ = x;
7165 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007167 }
7168 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 while (s < e) {
7170 unsigned char ch = *s;
7171 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007172
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7174 w = PyLong_FromLong((long)ch);
7175 if (w == NULL)
7176 goto onError;
7177 x = PyObject_GetItem(mapping, w);
7178 Py_DECREF(w);
7179 if (x == NULL) {
7180 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7181 /* No mapping found means: mapping is undefined. */
7182 PyErr_Clear();
7183 x = Py_None;
7184 Py_INCREF(x);
7185 } else
7186 goto onError;
7187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007188
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 /* Apply mapping */
7190 if (PyLong_Check(x)) {
7191 long value = PyLong_AS_LONG(x);
7192 if (value < 0 || value > 65535) {
7193 PyErr_SetString(PyExc_TypeError,
7194 "character mapping must be in range(65536)");
7195 Py_DECREF(x);
7196 goto onError;
7197 }
7198 *p++ = (Py_UNICODE)value;
7199 }
7200 else if (x == Py_None) {
7201 /* undefined mapping */
7202 outpos = p-PyUnicode_AS_UNICODE(v);
7203 startinpos = s-starts;
7204 endinpos = startinpos+1;
7205 if (unicode_decode_call_errorhandler(
7206 errors, &errorHandler,
7207 "charmap", "character maps to <undefined>",
7208 &starts, &e, &startinpos, &endinpos, &exc, &s,
7209 &v, &outpos, &p)) {
7210 Py_DECREF(x);
7211 goto onError;
7212 }
7213 Py_DECREF(x);
7214 continue;
7215 }
7216 else if (PyUnicode_Check(x)) {
7217 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007218
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 if (targetsize == 1)
7220 /* 1-1 mapping */
7221 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007222
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 else if (targetsize > 1) {
7224 /* 1-n mapping */
7225 if (targetsize > extrachars) {
7226 /* resize first */
7227 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7228 Py_ssize_t needed = (targetsize - extrachars) + \
7229 (targetsize << 2);
7230 extrachars += needed;
7231 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007232 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 PyUnicode_GET_SIZE(v) + needed) < 0) {
7234 Py_DECREF(x);
7235 goto onError;
7236 }
7237 p = PyUnicode_AS_UNICODE(v) + oldpos;
7238 }
7239 Py_UNICODE_COPY(p,
7240 PyUnicode_AS_UNICODE(x),
7241 targetsize);
7242 p += targetsize;
7243 extrachars -= targetsize;
7244 }
7245 /* 1-0 mapping: skip the character */
7246 }
7247 else {
7248 /* wrong return value */
7249 PyErr_SetString(PyExc_TypeError,
7250 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007251 Py_DECREF(x);
7252 goto onError;
7253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 Py_DECREF(x);
7255 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 }
7258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007259 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 Py_XDECREF(errorHandler);
7262 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007263#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007264 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 Py_DECREF(v);
7266 return NULL;
7267 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007268#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007269 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 Py_XDECREF(errorHandler);
7274 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 Py_XDECREF(v);
7276 return NULL;
7277}
7278
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279/* Charmap encoding: the lookup table */
7280
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 PyObject_HEAD
7283 unsigned char level1[32];
7284 int count2, count3;
7285 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007286};
7287
7288static PyObject*
7289encoding_map_size(PyObject *obj, PyObject* args)
7290{
7291 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007292 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007294}
7295
7296static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007297 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 PyDoc_STR("Return the size (in bytes) of this object") },
7299 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007300};
7301
7302static void
7303encoding_map_dealloc(PyObject* o)
7304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007305 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007306}
7307
7308static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007309 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 "EncodingMap", /*tp_name*/
7311 sizeof(struct encoding_map), /*tp_basicsize*/
7312 0, /*tp_itemsize*/
7313 /* methods */
7314 encoding_map_dealloc, /*tp_dealloc*/
7315 0, /*tp_print*/
7316 0, /*tp_getattr*/
7317 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007318 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 0, /*tp_repr*/
7320 0, /*tp_as_number*/
7321 0, /*tp_as_sequence*/
7322 0, /*tp_as_mapping*/
7323 0, /*tp_hash*/
7324 0, /*tp_call*/
7325 0, /*tp_str*/
7326 0, /*tp_getattro*/
7327 0, /*tp_setattro*/
7328 0, /*tp_as_buffer*/
7329 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7330 0, /*tp_doc*/
7331 0, /*tp_traverse*/
7332 0, /*tp_clear*/
7333 0, /*tp_richcompare*/
7334 0, /*tp_weaklistoffset*/
7335 0, /*tp_iter*/
7336 0, /*tp_iternext*/
7337 encoding_map_methods, /*tp_methods*/
7338 0, /*tp_members*/
7339 0, /*tp_getset*/
7340 0, /*tp_base*/
7341 0, /*tp_dict*/
7342 0, /*tp_descr_get*/
7343 0, /*tp_descr_set*/
7344 0, /*tp_dictoffset*/
7345 0, /*tp_init*/
7346 0, /*tp_alloc*/
7347 0, /*tp_new*/
7348 0, /*tp_free*/
7349 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007350};
7351
7352PyObject*
7353PyUnicode_BuildEncodingMap(PyObject* string)
7354{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007355 PyObject *result;
7356 struct encoding_map *mresult;
7357 int i;
7358 int need_dict = 0;
7359 unsigned char level1[32];
7360 unsigned char level2[512];
7361 unsigned char *mlevel1, *mlevel2, *mlevel3;
7362 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007363 int kind;
7364 void *data;
7365 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007368 PyErr_BadArgument();
7369 return NULL;
7370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007371 kind = PyUnicode_KIND(string);
7372 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007373 memset(level1, 0xFF, sizeof level1);
7374 memset(level2, 0xFF, sizeof level2);
7375
7376 /* If there isn't a one-to-one mapping of NULL to \0,
7377 or if there are non-BMP characters, we need to use
7378 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007380 need_dict = 1;
7381 for (i = 1; i < 256; i++) {
7382 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007383 ch = PyUnicode_READ(kind, data, i);
7384 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007385 need_dict = 1;
7386 break;
7387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007389 /* unmapped character */
7390 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 l1 = ch >> 11;
7392 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007393 if (level1[l1] == 0xFF)
7394 level1[l1] = count2++;
7395 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007397 }
7398
7399 if (count2 >= 0xFF || count3 >= 0xFF)
7400 need_dict = 1;
7401
7402 if (need_dict) {
7403 PyObject *result = PyDict_New();
7404 PyObject *key, *value;
7405 if (!result)
7406 return NULL;
7407 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007408 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007409 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007410 if (!key || !value)
7411 goto failed1;
7412 if (PyDict_SetItem(result, key, value) == -1)
7413 goto failed1;
7414 Py_DECREF(key);
7415 Py_DECREF(value);
7416 }
7417 return result;
7418 failed1:
7419 Py_XDECREF(key);
7420 Py_XDECREF(value);
7421 Py_DECREF(result);
7422 return NULL;
7423 }
7424
7425 /* Create a three-level trie */
7426 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7427 16*count2 + 128*count3 - 1);
7428 if (!result)
7429 return PyErr_NoMemory();
7430 PyObject_Init(result, &EncodingMapType);
7431 mresult = (struct encoding_map*)result;
7432 mresult->count2 = count2;
7433 mresult->count3 = count3;
7434 mlevel1 = mresult->level1;
7435 mlevel2 = mresult->level23;
7436 mlevel3 = mresult->level23 + 16*count2;
7437 memcpy(mlevel1, level1, 32);
7438 memset(mlevel2, 0xFF, 16*count2);
7439 memset(mlevel3, 0, 128*count3);
7440 count3 = 0;
7441 for (i = 1; i < 256; i++) {
7442 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007444 /* unmapped character */
7445 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007446 o1 = PyUnicode_READ(kind, data, i)>>11;
7447 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007448 i2 = 16*mlevel1[o1] + o2;
7449 if (mlevel2[i2] == 0xFF)
7450 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452 i3 = 128*mlevel2[i2] + o3;
7453 mlevel3[i3] = i;
7454 }
7455 return result;
7456}
7457
7458static int
7459encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7460{
7461 struct encoding_map *map = (struct encoding_map*)mapping;
7462 int l1 = c>>11;
7463 int l2 = (c>>7) & 0xF;
7464 int l3 = c & 0x7F;
7465 int i;
7466
7467#ifdef Py_UNICODE_WIDE
7468 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470 }
7471#endif
7472 if (c == 0)
7473 return 0;
7474 /* level 1*/
7475 i = map->level1[l1];
7476 if (i == 0xFF) {
7477 return -1;
7478 }
7479 /* level 2*/
7480 i = map->level23[16*i+l2];
7481 if (i == 0xFF) {
7482 return -1;
7483 }
7484 /* level 3 */
7485 i = map->level23[16*map->count2 + 128*i + l3];
7486 if (i == 0) {
7487 return -1;
7488 }
7489 return i;
7490}
7491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492/* Lookup the character ch in the mapping. If the character
7493 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007494 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007495static PyObject *
7496charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497{
Christian Heimes217cfd12007-12-02 14:31:20 +00007498 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 PyObject *x;
7500
7501 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 x = PyObject_GetItem(mapping, w);
7504 Py_DECREF(w);
7505 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7507 /* No mapping found means: mapping is undefined. */
7508 PyErr_Clear();
7509 x = Py_None;
7510 Py_INCREF(x);
7511 return x;
7512 } else
7513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007515 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007517 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 long value = PyLong_AS_LONG(x);
7519 if (value < 0 || value > 255) {
7520 PyErr_SetString(PyExc_TypeError,
7521 "character mapping must be in range(256)");
7522 Py_DECREF(x);
7523 return NULL;
7524 }
7525 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007527 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 /* wrong return value */
7531 PyErr_Format(PyExc_TypeError,
7532 "character mapping must return integer, bytes or None, not %.400s",
7533 x->ob_type->tp_name);
7534 Py_DECREF(x);
7535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 }
7537}
7538
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007539static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007540charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007541{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007542 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7543 /* exponentially overallocate to minimize reallocations */
7544 if (requiredsize < 2*outsize)
7545 requiredsize = 2*outsize;
7546 if (_PyBytes_Resize(outobj, requiredsize))
7547 return -1;
7548 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007549}
7550
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written to the output */
    enc_FAILED,         /* the mapping has no encoding for the character */
    enc_EXCEPTION       /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007555 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 space is available. Return a new reference to the object that
7557 was put in the output buffer, or Py_None, if the mapping was undefined
7558 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007559 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007560static charmapencode_result
7561charmapencode_output(Py_UNICODE c, PyObject *mapping,
7562 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007563{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007564 PyObject *rep;
7565 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007566 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567
Christian Heimes90aa7642007-12-19 02:45:37 +00007568 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007569 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007571 if (res == -1)
7572 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 if (outsize<requiredsize)
7574 if (charmapencode_resize(outobj, outpos, requiredsize))
7575 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007576 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 outstart[(*outpos)++] = (char)res;
7578 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007579 }
7580
7581 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007584 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 Py_DECREF(rep);
7586 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (PyLong_Check(rep)) {
7589 Py_ssize_t requiredsize = *outpos+1;
7590 if (outsize<requiredsize)
7591 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7592 Py_DECREF(rep);
7593 return enc_EXCEPTION;
7594 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007595 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 else {
7599 const char *repchars = PyBytes_AS_STRING(rep);
7600 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7601 Py_ssize_t requiredsize = *outpos+repsize;
7602 if (outsize<requiredsize)
7603 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7604 Py_DECREF(rep);
7605 return enc_EXCEPTION;
7606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007607 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 memcpy(outstart + *outpos, repchars, repsize);
7609 *outpos += repsize;
7610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007612 Py_DECREF(rep);
7613 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614}
7615
7616/* handle an error in PyUnicode_EncodeCharmap
7617 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007618static int
7619charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007622 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007623 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624{
7625 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t repsize;
7627 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 Py_UNICODE *uni2;
7629 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 Py_ssize_t collstartpos = *inpos;
7631 Py_ssize_t collendpos = *inpos+1;
7632 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 char *encoding = "charmap";
7634 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007635 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 /* find all unencodable characters */
7638 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007639 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007640 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 int res = encoding_map_lookup(p[collendpos], mapping);
7642 if (res != -1)
7643 break;
7644 ++collendpos;
7645 continue;
7646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 rep = charmapencode_lookup(p[collendpos], mapping);
7649 if (rep==NULL)
7650 return -1;
7651 else if (rep!=Py_None) {
7652 Py_DECREF(rep);
7653 break;
7654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 }
7658 /* cache callback name lookup
7659 * (if not done yet, i.e. it's the first error) */
7660 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 if ((errors==NULL) || (!strcmp(errors, "strict")))
7662 *known_errorHandler = 1;
7663 else if (!strcmp(errors, "replace"))
7664 *known_errorHandler = 2;
7665 else if (!strcmp(errors, "ignore"))
7666 *known_errorHandler = 3;
7667 else if (!strcmp(errors, "xmlcharrefreplace"))
7668 *known_errorHandler = 4;
7669 else
7670 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 }
7672 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 case 1: /* strict */
7674 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7675 return -1;
7676 case 2: /* replace */
7677 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 x = charmapencode_output('?', mapping, res, respos);
7679 if (x==enc_EXCEPTION) {
7680 return -1;
7681 }
7682 else if (x==enc_FAILED) {
7683 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7684 return -1;
7685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 }
7687 /* fall through */
7688 case 3: /* ignore */
7689 *inpos = collendpos;
7690 break;
7691 case 4: /* xmlcharrefreplace */
7692 /* generate replacement (temporarily (mis)uses p) */
7693 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 char buffer[2+29+1+1];
7695 char *cp;
7696 sprintf(buffer, "&#%d;", (int)p[collpos]);
7697 for (cp = buffer; *cp; ++cp) {
7698 x = charmapencode_output(*cp, mapping, res, respos);
7699 if (x==enc_EXCEPTION)
7700 return -1;
7701 else if (x==enc_FAILED) {
7702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7703 return -1;
7704 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 }
7706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 *inpos = collendpos;
7708 break;
7709 default:
7710 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 encoding, reason, p, size, exceptionObject,
7712 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007713 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007715 if (PyBytes_Check(repunicode)) {
7716 /* Directly copy bytes result to output. */
7717 Py_ssize_t outsize = PyBytes_Size(*res);
7718 Py_ssize_t requiredsize;
7719 repsize = PyBytes_Size(repunicode);
7720 requiredsize = *respos + repsize;
7721 if (requiredsize > outsize)
7722 /* Make room for all additional bytes. */
7723 if (charmapencode_resize(res, respos, requiredsize)) {
7724 Py_DECREF(repunicode);
7725 return -1;
7726 }
7727 memcpy(PyBytes_AsString(*res) + *respos,
7728 PyBytes_AsString(repunicode), repsize);
7729 *respos += repsize;
7730 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007731 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007732 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 /* generate replacement */
7735 repsize = PyUnicode_GET_SIZE(repunicode);
7736 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 x = charmapencode_output(*uni2, mapping, res, respos);
7738 if (x==enc_EXCEPTION) {
7739 return -1;
7740 }
7741 else if (x==enc_FAILED) {
7742 Py_DECREF(repunicode);
7743 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7744 return -1;
7745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 }
7747 *inpos = newpos;
7748 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 }
7750 return 0;
7751}
7752
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753PyObject *
7754PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7755 Py_ssize_t size,
7756 PyObject *mapping,
7757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 /* output object */
7760 PyObject *res = NULL;
7761 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007764 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765 PyObject *errorHandler = NULL;
7766 PyObject *exc = NULL;
7767 /* the following variable is used for caching string comparisons
7768 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7769 * 3=ignore, 4=xmlcharrefreplace */
7770 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772 /* Default to Latin-1 */
7773 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776 /* allocate enough for a simple encoding without
7777 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007778 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007779 if (res == NULL)
7780 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007781 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007784 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 /* try to encode it */
7786 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7787 if (x==enc_EXCEPTION) /* error */
7788 goto onError;
7789 if (x==enc_FAILED) { /* unencodable character */
7790 if (charmap_encoding_error(p, size, &inpos, mapping,
7791 &exc,
7792 &known_errorHandler, &errorHandler, errors,
7793 &res, &respos)) {
7794 goto onError;
7795 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007796 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 else
7798 /* done with this character => adjust input position */
7799 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007803 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007804 if (_PyBytes_Resize(&res, respos) < 0)
7805 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007807 Py_XDECREF(exc);
7808 Py_XDECREF(errorHandler);
7809 return res;
7810
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 Py_XDECREF(res);
7813 Py_XDECREF(exc);
7814 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 return NULL;
7816}
7817
Alexander Belopolsky40018472011-02-26 01:02:56 +00007818PyObject *
7819PyUnicode_AsCharmapString(PyObject *unicode,
7820 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821{
7822 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 PyErr_BadArgument();
7824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 }
7826 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 PyUnicode_GET_SIZE(unicode),
7828 mapping,
7829 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007832/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007833static void
7834make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007836 Py_ssize_t startpos, Py_ssize_t endpos,
7837 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 *exceptionObject = _PyUnicodeTranslateError_Create(
7841 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 }
7843 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7845 goto onError;
7846 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7847 goto onError;
7848 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7849 goto onError;
7850 return;
7851 onError:
7852 Py_DECREF(*exceptionObject);
7853 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 }
7855}
7856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007858static void
7859raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007861 Py_ssize_t startpos, Py_ssize_t endpos,
7862 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863{
7864 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868}
7869
7870/* error handling callback helper:
7871 build arguments, call the callback and check the arguments,
7872 put the result into newpos and return the replacement string, which
7873 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874static PyObject *
7875unicode_translate_call_errorhandler(const char *errors,
7876 PyObject **errorHandler,
7877 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007879 Py_ssize_t startpos, Py_ssize_t endpos,
7880 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007882 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007884 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885 PyObject *restuple;
7886 PyObject *resunicode;
7887
7888 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007890 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007892 }
7893
7894 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007898
7899 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007901 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007903 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007904 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_DECREF(restuple);
7906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007907 }
7908 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 &resunicode, &i_newpos)) {
7910 Py_DECREF(restuple);
7911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007912 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007915 else
7916 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7919 Py_DECREF(restuple);
7920 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 Py_INCREF(resunicode);
7923 Py_DECREF(restuple);
7924 return resunicode;
7925}
7926
7927/* Lookup the character ch in the mapping and put the result in result,
7928 which must be decrefed by the caller.
7929 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932{
Christian Heimes217cfd12007-12-02 14:31:20 +00007933 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 PyObject *x;
7935
7936 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 x = PyObject_GetItem(mapping, w);
7939 Py_DECREF(w);
7940 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7942 /* No mapping found means: use 1:1 mapping. */
7943 PyErr_Clear();
7944 *result = NULL;
7945 return 0;
7946 } else
7947 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948 }
7949 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 *result = x;
7951 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007953 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 long value = PyLong_AS_LONG(x);
7955 long max = PyUnicode_GetMax();
7956 if (value < 0 || value > max) {
7957 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007958 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 Py_DECREF(x);
7960 return -1;
7961 }
7962 *result = x;
7963 return 0;
7964 }
7965 else if (PyUnicode_Check(x)) {
7966 *result = x;
7967 return 0;
7968 }
7969 else {
7970 /* wrong return value */
7971 PyErr_SetString(PyExc_TypeError,
7972 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973 Py_DECREF(x);
7974 return -1;
7975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976}
7977/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 if not reallocate and adjust various state variables.
7979 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007980static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007985 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 /* exponentially overallocate to minimize reallocations */
7987 if (requiredsize < 2 * oldsize)
7988 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7990 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 }
7994 return 0;
7995}
7996/* lookup the character, put the result in the output string and adjust
7997 various state variables. Return a new reference to the object that
7998 was put in the output buffer in *result, or Py_None, if the mapping was
7999 undefined (in which case no character was written).
8000 The called must decref result.
8001 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008002static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8004 PyObject *mapping, Py_UCS4 **output,
8005 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008006 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8009 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008014 }
8015 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008017 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008019 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 }
8021 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 Py_ssize_t repsize;
8023 if (PyUnicode_READY(*res) == -1)
8024 return -1;
8025 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 if (repsize==1) {
8027 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008028 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 }
8030 else if (repsize!=0) {
8031 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 Py_ssize_t requiredsize = *opos +
8033 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 Py_ssize_t i;
8036 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 for(i = 0; i < repsize; i++)
8039 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 }
8042 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 return 0;
8045}
8046
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048_PyUnicode_TranslateCharmap(PyObject *input,
8049 PyObject *mapping,
8050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 /* input object */
8053 char *idata;
8054 Py_ssize_t size, i;
8055 int kind;
8056 /* output buffer */
8057 Py_UCS4 *output = NULL;
8058 Py_ssize_t osize;
8059 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062 char *reason = "character maps to <undefined>";
8063 PyObject *errorHandler = NULL;
8064 PyObject *exc = NULL;
8065 /* the following variable is used for caching string comparisons
8066 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8067 * 3=ignore, 4=xmlcharrefreplace */
8068 int known_errorHandler = -1;
8069
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 PyErr_BadArgument();
8072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 if (PyUnicode_READY(input) == -1)
8076 return NULL;
8077 idata = (char*)PyUnicode_DATA(input);
8078 kind = PyUnicode_KIND(input);
8079 size = PyUnicode_GET_LENGTH(input);
8080 i = 0;
8081
8082 if (size == 0) {
8083 Py_INCREF(input);
8084 return input;
8085 }
8086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 /* allocate enough for a simple 1:1 translation without
8088 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 osize = size;
8090 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8091 opos = 0;
8092 if (output == NULL) {
8093 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 /* try to encode it */
8099 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 if (charmaptranslate_output(input, i, mapping,
8101 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_XDECREF(x);
8103 goto onError;
8104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 else { /* untranslatable character */
8109 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8110 Py_ssize_t repsize;
8111 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 Py_ssize_t collstart = i;
8115 Py_ssize_t collend = i+1;
8116 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 while (collend < size) {
8120 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 goto onError;
8122 Py_XDECREF(x);
8123 if (x!=Py_None)
8124 break;
8125 ++collend;
8126 }
8127 /* cache callback name lookup
8128 * (if not done yet, i.e. it's the first error) */
8129 if (known_errorHandler==-1) {
8130 if ((errors==NULL) || (!strcmp(errors, "strict")))
8131 known_errorHandler = 1;
8132 else if (!strcmp(errors, "replace"))
8133 known_errorHandler = 2;
8134 else if (!strcmp(errors, "ignore"))
8135 known_errorHandler = 3;
8136 else if (!strcmp(errors, "xmlcharrefreplace"))
8137 known_errorHandler = 4;
8138 else
8139 known_errorHandler = 0;
8140 }
8141 switch (known_errorHandler) {
8142 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 raise_translate_exception(&exc, input, collstart,
8144 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 case 2: /* replace */
8147 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 for (coll = collstart; coll<collend; coll++)
8149 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 /* fall through */
8151 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 break;
8154 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 /* generate replacement (temporarily (mis)uses i) */
8156 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 char buffer[2+29+1+1];
8158 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8160 if (charmaptranslate_makespace(&output, &osize,
8161 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 goto onError;
8163 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 break;
8168 default:
8169 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 reason, input, &exc,
8171 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008172 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 goto onError;
8174 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 repsize = PyUnicode_GET_LENGTH(repunicode);
8176 if (charmaptranslate_makespace(&output, &osize,
8177 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 Py_DECREF(repunicode);
8179 goto onError;
8180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 for (uni2 = 0; repsize-->0; ++uni2)
8182 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8183 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
8187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8189 if (!res)
8190 goto onError;
8191 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 Py_XDECREF(exc);
8193 Py_XDECREF(errorHandler);
8194 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 Py_XDECREF(exc);
8199 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 return NULL;
8201}
8202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203/* Deprecated. Use PyUnicode_Translate instead. */
8204PyObject *
8205PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8206 Py_ssize_t size,
8207 PyObject *mapping,
8208 const char *errors)
8209{
8210 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8211 if (!unicode)
8212 return NULL;
8213 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8214}
8215
Alexander Belopolsky40018472011-02-26 01:02:56 +00008216PyObject *
8217PyUnicode_Translate(PyObject *str,
8218 PyObject *mapping,
8219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
8221 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 str = PyUnicode_FromObject(str);
8224 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 Py_DECREF(str);
8228 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008229
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 Py_XDECREF(str);
8232 return NULL;
8233}
Tim Petersced69f82003-09-16 20:30:58 +00008234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008236fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237{
8238 /* No need to call PyUnicode_READY(self) because this function is only
8239 called as a callback from fixup() which does it already. */
8240 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8241 const int kind = PyUnicode_KIND(self);
8242 void *data = PyUnicode_DATA(self);
8243 Py_UCS4 maxchar = 0, ch, fixed;
8244 Py_ssize_t i;
8245
8246 for (i = 0; i < len; ++i) {
8247 ch = PyUnicode_READ(kind, data, i);
8248 fixed = 0;
8249 if (ch > 127) {
8250 if (Py_UNICODE_ISSPACE(ch))
8251 fixed = ' ';
8252 else {
8253 const int decimal = Py_UNICODE_TODECIMAL(ch);
8254 if (decimal >= 0)
8255 fixed = '0' + decimal;
8256 }
8257 if (fixed != 0) {
8258 if (fixed > maxchar)
8259 maxchar = fixed;
8260 PyUnicode_WRITE(kind, data, i, fixed);
8261 }
8262 else if (ch > maxchar)
8263 maxchar = ch;
8264 }
8265 else if (ch > maxchar)
8266 maxchar = ch;
8267 }
8268
8269 return maxchar;
8270}
8271
8272PyObject *
8273_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8274{
8275 if (!PyUnicode_Check(unicode)) {
8276 PyErr_BadInternalCall();
8277 return NULL;
8278 }
8279 if (PyUnicode_READY(unicode) == -1)
8280 return NULL;
8281 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8282 /* If the string is already ASCII, just return the same string */
8283 Py_INCREF(unicode);
8284 return unicode;
8285 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008286 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287}
8288
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008289PyObject *
8290PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8291 Py_ssize_t length)
8292{
8293 PyObject *result;
8294 Py_UNICODE *p; /* write pointer into result */
8295 Py_ssize_t i;
8296 /* Copy to a new string */
8297 result = (PyObject *)_PyUnicode_New(length);
8298 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8299 if (result == NULL)
8300 return result;
8301 p = PyUnicode_AS_UNICODE(result);
8302 /* Iterate over code points */
8303 for (i = 0; i < length; i++) {
8304 Py_UNICODE ch =s[i];
8305 if (ch > 127) {
8306 int decimal = Py_UNICODE_TODECIMAL(ch);
8307 if (decimal >= 0)
8308 p[i] = '0' + decimal;
8309 }
8310 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008311#ifndef DONT_MAKE_RESULT_READY
8312 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 Py_DECREF(result);
8314 return NULL;
8315 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008316#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008317 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008318 return result;
8319}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008320/* --- Decimal Encoder ---------------------------------------------------- */
8321
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322int
8323PyUnicode_EncodeDecimal(Py_UNICODE *s,
8324 Py_ssize_t length,
8325 char *output,
8326 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008327{
8328 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 PyObject *errorHandler = NULL;
8330 PyObject *exc = NULL;
8331 const char *encoding = "decimal";
8332 const char *reason = "invalid decimal Unicode string";
8333 /* the following variable is used for caching string comparisons
8334 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8335 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008336
8337 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 PyErr_BadArgument();
8339 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008340 }
8341
8342 p = s;
8343 end = s + length;
8344 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 register Py_UNICODE ch = *p;
8346 int decimal;
8347 PyObject *repunicode;
8348 Py_ssize_t repsize;
8349 Py_ssize_t newpos;
8350 Py_UNICODE *uni2;
8351 Py_UNICODE *collstart;
8352 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008353
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 ++p;
8357 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 decimal = Py_UNICODE_TODECIMAL(ch);
8360 if (decimal >= 0) {
8361 *output++ = '0' + decimal;
8362 ++p;
8363 continue;
8364 }
8365 if (0 < ch && ch < 256) {
8366 *output++ = (char)ch;
8367 ++p;
8368 continue;
8369 }
8370 /* All other characters are considered unencodable */
8371 collstart = p;
8372 collend = p+1;
8373 while (collend < end) {
8374 if ((0 < *collend && *collend < 256) ||
8375 !Py_UNICODE_ISSPACE(*collend) ||
8376 Py_UNICODE_TODECIMAL(*collend))
8377 break;
8378 }
8379 /* cache callback name lookup
8380 * (if not done yet, i.e. it's the first error) */
8381 if (known_errorHandler==-1) {
8382 if ((errors==NULL) || (!strcmp(errors, "strict")))
8383 known_errorHandler = 1;
8384 else if (!strcmp(errors, "replace"))
8385 known_errorHandler = 2;
8386 else if (!strcmp(errors, "ignore"))
8387 known_errorHandler = 3;
8388 else if (!strcmp(errors, "xmlcharrefreplace"))
8389 known_errorHandler = 4;
8390 else
8391 known_errorHandler = 0;
8392 }
8393 switch (known_errorHandler) {
8394 case 1: /* strict */
8395 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8396 goto onError;
8397 case 2: /* replace */
8398 for (p = collstart; p < collend; ++p)
8399 *output++ = '?';
8400 /* fall through */
8401 case 3: /* ignore */
8402 p = collend;
8403 break;
8404 case 4: /* xmlcharrefreplace */
8405 /* generate replacement (temporarily (mis)uses p) */
8406 for (p = collstart; p < collend; ++p)
8407 output += sprintf(output, "&#%d;", (int)*p);
8408 p = collend;
8409 break;
8410 default:
8411 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8412 encoding, reason, s, length, &exc,
8413 collstart-s, collend-s, &newpos);
8414 if (repunicode == NULL)
8415 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008416 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008417 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008418 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8419 Py_DECREF(repunicode);
8420 goto onError;
8421 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 /* generate replacement */
8423 repsize = PyUnicode_GET_SIZE(repunicode);
8424 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8425 Py_UNICODE ch = *uni2;
8426 if (Py_UNICODE_ISSPACE(ch))
8427 *output++ = ' ';
8428 else {
8429 decimal = Py_UNICODE_TODECIMAL(ch);
8430 if (decimal >= 0)
8431 *output++ = '0' + decimal;
8432 else if (0 < ch && ch < 256)
8433 *output++ = (char)ch;
8434 else {
8435 Py_DECREF(repunicode);
8436 raise_encode_exception(&exc, encoding,
8437 s, length, collstart-s, collend-s, reason);
8438 goto onError;
8439 }
8440 }
8441 }
8442 p = s + newpos;
8443 Py_DECREF(repunicode);
8444 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008445 }
8446 /* 0-terminate the output string */
8447 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 Py_XDECREF(exc);
8449 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008450 return 0;
8451
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 Py_XDECREF(exc);
8454 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008455 return -1;
8456}
8457
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458/* --- Helpers ------------------------------------------------------------ */
8459
Victor Stinnerc3cec782011-10-05 21:24:08 +02008460#include "stringlib/asciilib.h"
8461#include "stringlib/fastsearch.h"
8462#include "stringlib/partition.h"
8463#include "stringlib/split.h"
8464#include "stringlib/count.h"
8465#include "stringlib/find.h"
8466#include "stringlib/localeutil.h"
8467#include "stringlib/undef.h"
8468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469#include "stringlib/ucs1lib.h"
8470#include "stringlib/fastsearch.h"
8471#include "stringlib/partition.h"
8472#include "stringlib/split.h"
8473#include "stringlib/count.h"
8474#include "stringlib/find.h"
8475#include "stringlib/localeutil.h"
8476#include "stringlib/undef.h"
8477
8478#include "stringlib/ucs2lib.h"
8479#include "stringlib/fastsearch.h"
8480#include "stringlib/partition.h"
8481#include "stringlib/split.h"
8482#include "stringlib/count.h"
8483#include "stringlib/find.h"
8484#include "stringlib/localeutil.h"
8485#include "stringlib/undef.h"
8486
8487#include "stringlib/ucs4lib.h"
8488#include "stringlib/fastsearch.h"
8489#include "stringlib/partition.h"
8490#include "stringlib/split.h"
8491#include "stringlib/count.h"
8492#include "stringlib/find.h"
8493#include "stringlib/localeutil.h"
8494#include "stringlib/undef.h"
8495
8496static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008497any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8498 const Py_UCS1*, Py_ssize_t,
8499 Py_ssize_t, Py_ssize_t),
8500 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 const Py_UCS1*, Py_ssize_t,
8502 Py_ssize_t, Py_ssize_t),
8503 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8504 const Py_UCS2*, Py_ssize_t,
8505 Py_ssize_t, Py_ssize_t),
8506 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8507 const Py_UCS4*, Py_ssize_t,
8508 Py_ssize_t, Py_ssize_t),
8509 PyObject* s1, PyObject* s2,
8510 Py_ssize_t start,
8511 Py_ssize_t end)
8512{
8513 int kind1, kind2, kind;
8514 void *buf1, *buf2;
8515 Py_ssize_t len1, len2, result;
8516
8517 kind1 = PyUnicode_KIND(s1);
8518 kind2 = PyUnicode_KIND(s2);
8519 kind = kind1 > kind2 ? kind1 : kind2;
8520 buf1 = PyUnicode_DATA(s1);
8521 buf2 = PyUnicode_DATA(s2);
8522 if (kind1 != kind)
8523 buf1 = _PyUnicode_AsKind(s1, kind);
8524 if (!buf1)
8525 return -2;
8526 if (kind2 != kind)
8527 buf2 = _PyUnicode_AsKind(s2, kind);
8528 if (!buf2) {
8529 if (kind1 != kind) PyMem_Free(buf1);
8530 return -2;
8531 }
8532 len1 = PyUnicode_GET_LENGTH(s1);
8533 len2 = PyUnicode_GET_LENGTH(s2);
8534
8535 switch(kind) {
8536 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008537 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8538 result = ascii(buf1, len1, buf2, len2, start, end);
8539 else
8540 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 break;
8542 case PyUnicode_2BYTE_KIND:
8543 result = ucs2(buf1, len1, buf2, len2, start, end);
8544 break;
8545 case PyUnicode_4BYTE_KIND:
8546 result = ucs4(buf1, len1, buf2, len2, start, end);
8547 break;
8548 default:
8549 assert(0); result = -2;
8550 }
8551
8552 if (kind1 != kind)
8553 PyMem_Free(buf1);
8554 if (kind2 != kind)
8555 PyMem_Free(buf2);
8556
8557 return result;
8558}
8559
8560Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008561_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 Py_ssize_t n_buffer,
8563 void *digits, Py_ssize_t n_digits,
8564 Py_ssize_t min_width,
8565 const char *grouping,
8566 const char *thousands_sep)
8567{
8568 switch(kind) {
8569 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008570 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8571 return _PyUnicode_ascii_InsertThousandsGrouping(
8572 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8573 min_width, grouping, thousands_sep);
8574 else
8575 return _PyUnicode_ucs1_InsertThousandsGrouping(
8576 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8577 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 case PyUnicode_2BYTE_KIND:
8579 return _PyUnicode_ucs2_InsertThousandsGrouping(
8580 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8581 min_width, grouping, thousands_sep);
8582 case PyUnicode_4BYTE_KIND:
8583 return _PyUnicode_ucs4_InsertThousandsGrouping(
8584 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8585 min_width, grouping, thousands_sep);
8586 }
8587 assert(0);
8588 return -1;
8589}
8590
8591
Eric Smith8c663262007-08-25 02:26:07 +00008592#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008593#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008594
Thomas Wouters477c8d52006-05-27 19:21:47 +00008595#include "stringlib/count.h"
8596#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008597
/* Helper macro to fix up start/end slice values for a sequence of length
   len.  start and end must be modifiable lvalues; they are updated in
   place:
     - end greater than len is clamped to len;
     - a negative end or start has len added to it and, if still negative,
       is set to 0.
   A start greater than len is left unchanged.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else), and every argument is
   parenthesized so that expression arguments keep their meaning.  Use it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008612
Alexander Belopolsky40018472011-02-26 01:02:56 +00008613Py_ssize_t
8614PyUnicode_Count(PyObject *str,
8615 PyObject *substr,
8616 Py_ssize_t start,
8617 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008619 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008620 PyUnicodeObject* str_obj;
8621 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 int kind1, kind2, kind;
8623 void *buf1 = NULL, *buf2 = NULL;
8624 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008625
Thomas Wouters477c8d52006-05-27 19:21:47 +00008626 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008629 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008630 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 Py_DECREF(str_obj);
8632 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 }
Tim Petersced69f82003-09-16 20:30:58 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 kind1 = PyUnicode_KIND(str_obj);
8636 kind2 = PyUnicode_KIND(sub_obj);
8637 kind = kind1 > kind2 ? kind1 : kind2;
8638 buf1 = PyUnicode_DATA(str_obj);
8639 if (kind1 != kind)
8640 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8641 if (!buf1)
8642 goto onError;
8643 buf2 = PyUnicode_DATA(sub_obj);
8644 if (kind2 != kind)
8645 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8646 if (!buf2)
8647 goto onError;
8648 len1 = PyUnicode_GET_LENGTH(str_obj);
8649 len2 = PyUnicode_GET_LENGTH(sub_obj);
8650
8651 ADJUST_INDICES(start, end, len1);
8652 switch(kind) {
8653 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008654 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8655 result = asciilib_count(
8656 ((Py_UCS1*)buf1) + start, end - start,
8657 buf2, len2, PY_SSIZE_T_MAX
8658 );
8659 else
8660 result = ucs1lib_count(
8661 ((Py_UCS1*)buf1) + start, end - start,
8662 buf2, len2, PY_SSIZE_T_MAX
8663 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 break;
8665 case PyUnicode_2BYTE_KIND:
8666 result = ucs2lib_count(
8667 ((Py_UCS2*)buf1) + start, end - start,
8668 buf2, len2, PY_SSIZE_T_MAX
8669 );
8670 break;
8671 case PyUnicode_4BYTE_KIND:
8672 result = ucs4lib_count(
8673 ((Py_UCS4*)buf1) + start, end - start,
8674 buf2, len2, PY_SSIZE_T_MAX
8675 );
8676 break;
8677 default:
8678 assert(0); result = 0;
8679 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680
8681 Py_DECREF(sub_obj);
8682 Py_DECREF(str_obj);
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 if (kind1 != kind)
8685 PyMem_Free(buf1);
8686 if (kind2 != kind)
8687 PyMem_Free(buf2);
8688
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 onError:
8691 Py_DECREF(sub_obj);
8692 Py_DECREF(str_obj);
8693 if (kind1 != kind && buf1)
8694 PyMem_Free(buf1);
8695 if (kind2 != kind && buf2)
8696 PyMem_Free(buf2);
8697 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698}
8699
Alexander Belopolsky40018472011-02-26 01:02:56 +00008700Py_ssize_t
8701PyUnicode_Find(PyObject *str,
8702 PyObject *sub,
8703 Py_ssize_t start,
8704 Py_ssize_t end,
8705 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008707 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008708
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_DECREF(str);
8715 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 }
Tim Petersced69f82003-09-16 20:30:58 +00008717
Thomas Wouters477c8d52006-05-27 19:21:47 +00008718 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008720 asciilib_find_slice, ucs1lib_find_slice,
8721 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008723 );
8724 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008726 asciilib_find_slice, ucs1lib_rfind_slice,
8727 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008729 );
8730
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008732 Py_DECREF(sub);
8733
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 return result;
8735}
8736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737Py_ssize_t
8738PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8739 Py_ssize_t start, Py_ssize_t end,
8740 int direction)
8741{
8742 char *result;
8743 int kind;
8744 if (PyUnicode_READY(str) == -1)
8745 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008746 if (start < 0 || end < 0) {
8747 PyErr_SetString(PyExc_IndexError, "string index out of range");
8748 return -2;
8749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 if (end > PyUnicode_GET_LENGTH(str))
8751 end = PyUnicode_GET_LENGTH(str);
8752 kind = PyUnicode_KIND(str);
8753 result = findchar(PyUnicode_1BYTE_DATA(str)
8754 + PyUnicode_KIND_SIZE(kind, start),
8755 kind,
8756 end-start, ch, direction);
8757 if (!result)
8758 return -1;
8759 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8760}
8761
Alexander Belopolsky40018472011-02-26 01:02:56 +00008762static int
8763tailmatch(PyUnicodeObject *self,
8764 PyUnicodeObject *substring,
8765 Py_ssize_t start,
8766 Py_ssize_t end,
8767 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 int kind_self;
8770 int kind_sub;
8771 void *data_self;
8772 void *data_sub;
8773 Py_ssize_t offset;
8774 Py_ssize_t i;
8775 Py_ssize_t end_sub;
8776
8777 if (PyUnicode_READY(self) == -1 ||
8778 PyUnicode_READY(substring) == -1)
8779 return 0;
8780
8781 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 return 1;
8783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8785 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 kind_self = PyUnicode_KIND(self);
8790 data_self = PyUnicode_DATA(self);
8791 kind_sub = PyUnicode_KIND(substring);
8792 data_sub = PyUnicode_DATA(substring);
8793 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8794
8795 if (direction > 0)
8796 offset = end;
8797 else
8798 offset = start;
8799
8800 if (PyUnicode_READ(kind_self, data_self, offset) ==
8801 PyUnicode_READ(kind_sub, data_sub, 0) &&
8802 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8803 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8804 /* If both are of the same kind, memcmp is sufficient */
8805 if (kind_self == kind_sub) {
8806 return ! memcmp((char *)data_self +
8807 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8808 data_sub,
8809 PyUnicode_GET_LENGTH(substring) *
8810 PyUnicode_CHARACTER_SIZE(substring));
8811 }
8812 /* otherwise we have to compare each character by first accesing it */
8813 else {
8814 /* We do not need to compare 0 and len(substring)-1 because
8815 the if statement above ensured already that they are equal
8816 when we end up here. */
8817 // TODO: honor direction and do a forward or backwards search
8818 for (i = 1; i < end_sub; ++i) {
8819 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8820 PyUnicode_READ(kind_sub, data_sub, i))
8821 return 0;
8822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 }
8826
8827 return 0;
8828}
8829
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830Py_ssize_t
8831PyUnicode_Tailmatch(PyObject *str,
8832 PyObject *substr,
8833 Py_ssize_t start,
8834 Py_ssize_t end,
8835 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 str = PyUnicode_FromObject(str);
8840 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 substr = PyUnicode_FromObject(substr);
8843 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 Py_DECREF(str);
8845 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
Tim Petersced69f82003-09-16 20:30:58 +00008847
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 (PyUnicodeObject *)substr,
8850 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 Py_DECREF(str);
8852 Py_DECREF(substr);
8853 return result;
8854}
8855
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856/* Apply fixfct filter to the Unicode object self and return a
8857 reference to the modified object */
8858
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008860fixup(PyObject *self,
8861 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 PyObject *u;
8864 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 if (PyUnicode_READY(self) == -1)
8867 return NULL;
8868 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8869 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8870 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8875 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 /* fix functions return the new maximum character in a string,
8878 if the kind of the resulting unicode object does not change,
8879 everything is fine. Otherwise we need to change the string kind
8880 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008881 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 if (maxchar_new == 0)
8883 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8884 else if (maxchar_new <= 127)
8885 maxchar_new = 127;
8886 else if (maxchar_new <= 255)
8887 maxchar_new = 255;
8888 else if (maxchar_new <= 65535)
8889 maxchar_new = 65535;
8890 else
8891 maxchar_new = 1114111; /* 0x10ffff */
8892
8893 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 /* fixfct should return TRUE if it modified the buffer. If
8895 FALSE, return a reference to the original buffer instead
8896 (to save space, not time) */
8897 Py_INCREF(self);
8898 Py_DECREF(u);
8899 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 else if (maxchar_new == maxchar_old) {
8902 return u;
8903 }
8904 else {
8905 /* In case the maximum character changed, we need to
8906 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008907 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (v == NULL) {
8909 Py_DECREF(u);
8910 return NULL;
8911 }
8912 if (maxchar_new > maxchar_old) {
8913 /* If the maxchar increased so that the kind changed, not all
8914 characters are representable anymore and we need to fix the
8915 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008916 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008917 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8919 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008920 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008921 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923
8924 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008925 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 return v;
8927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928}
8929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008931fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 /* No need to call PyUnicode_READY(self) because this function is only
8934 called as a callback from fixup() which does it already. */
8935 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8936 const int kind = PyUnicode_KIND(self);
8937 void *data = PyUnicode_DATA(self);
8938 int touched = 0;
8939 Py_UCS4 maxchar = 0;
8940 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 for (i = 0; i < len; ++i) {
8943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8944 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8945 if (up != ch) {
8946 if (up > maxchar)
8947 maxchar = up;
8948 PyUnicode_WRITE(kind, data, i, up);
8949 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 else if (ch > maxchar)
8952 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 }
8954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 if (touched)
8956 return maxchar;
8957 else
8958 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
8960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008962fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8965 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8966 const int kind = PyUnicode_KIND(self);
8967 void *data = PyUnicode_DATA(self);
8968 int touched = 0;
8969 Py_UCS4 maxchar = 0;
8970 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 for(i = 0; i < len; ++i) {
8973 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8974 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8975 if (lo != ch) {
8976 if (lo > maxchar)
8977 maxchar = lo;
8978 PyUnicode_WRITE(kind, data, i, lo);
8979 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 else if (ch > maxchar)
8982 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
8984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (touched)
8986 return maxchar;
8987 else
8988 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989}
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008992fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8995 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8996 const int kind = PyUnicode_KIND(self);
8997 void *data = PyUnicode_DATA(self);
8998 int touched = 0;
8999 Py_UCS4 maxchar = 0;
9000 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 for(i = 0; i < len; ++i) {
9003 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9004 Py_UCS4 nu = 0;
9005
9006 if (Py_UNICODE_ISUPPER(ch))
9007 nu = Py_UNICODE_TOLOWER(ch);
9008 else if (Py_UNICODE_ISLOWER(ch))
9009 nu = Py_UNICODE_TOUPPER(ch);
9010
9011 if (nu != 0) {
9012 if (nu > maxchar)
9013 maxchar = nu;
9014 PyUnicode_WRITE(kind, data, i, nu);
9015 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 else if (ch > maxchar)
9018 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
9020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 if (touched)
9022 return maxchar;
9023 else
9024 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009028fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9031 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9032 const int kind = PyUnicode_KIND(self);
9033 void *data = PyUnicode_DATA(self);
9034 int touched = 0;
9035 Py_UCS4 maxchar = 0;
9036 Py_ssize_t i = 0;
9037 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009038
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009039 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041
9042 ch = PyUnicode_READ(kind, data, i);
9043 if (!Py_UNICODE_ISUPPER(ch)) {
9044 maxchar = Py_UNICODE_TOUPPER(ch);
9045 PyUnicode_WRITE(kind, data, i, maxchar);
9046 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 ++i;
9049 for(; i < len; ++i) {
9050 ch = PyUnicode_READ(kind, data, i);
9051 if (!Py_UNICODE_ISLOWER(ch)) {
9052 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9053 if (lo > maxchar)
9054 maxchar = lo;
9055 PyUnicode_WRITE(kind, data, i, lo);
9056 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 else if (ch > maxchar)
9059 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061
9062 if (touched)
9063 return maxchar;
9064 else
9065 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066}
9067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009069fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9072 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9073 const int kind = PyUnicode_KIND(self);
9074 void *data = PyUnicode_DATA(self);
9075 Py_UCS4 maxchar = 0;
9076 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 int previous_is_cased;
9078
9079 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 if (len == 1) {
9081 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9082 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9083 if (ti != ch) {
9084 PyUnicode_WRITE(kind, data, i, ti);
9085 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 }
9087 else
9088 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 for(; i < len; ++i) {
9092 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9093 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009094
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 nu = Py_UNICODE_TOTITLE(ch);
9099
9100 if (nu > maxchar)
9101 maxchar = nu;
9102 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009103
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 if (Py_UNICODE_ISLOWER(ch) ||
9105 Py_UNICODE_ISUPPER(ch) ||
9106 Py_UNICODE_ISTITLE(ch))
9107 previous_is_cased = 1;
9108 else
9109 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112}
9113
Tim Peters8ce9f162004-08-27 01:49:32 +00009114PyObject *
9115PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009118 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009120 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009121 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9122 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009123 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009125 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127
Tim Peters05eba1f2004-08-27 21:32:02 +00009128 fseq = PySequence_Fast(seq, "");
9129 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009131 }
9132
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009133 /* NOTE: the following code can't call back into Python code,
9134 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009135 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009136
Tim Peters05eba1f2004-08-27 21:32:02 +00009137 seqlen = PySequence_Fast_GET_SIZE(fseq);
9138 /* If empty sequence, return u"". */
9139 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009140 Py_DECREF(fseq);
9141 Py_INCREF(unicode_empty);
9142 res = unicode_empty;
9143 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009144 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009145
Tim Peters05eba1f2004-08-27 21:32:02 +00009146 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009147 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009148 if (seqlen == 1) {
9149 if (PyUnicode_CheckExact(items[0])) {
9150 res = items[0];
9151 Py_INCREF(res);
9152 Py_DECREF(fseq);
9153 return res;
9154 }
9155 sep = NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009156 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009157 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009158 /* Set up sep and seplen */
9159 if (separator == NULL) {
9160 /* fall back to a blank space separator */
9161 sep = PyUnicode_FromOrdinal(' ');
9162 if (!sep)
9163 goto onError;
9164 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009165 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009166 else {
9167 if (!PyUnicode_Check(separator)) {
9168 PyErr_Format(PyExc_TypeError,
9169 "separator: expected str instance,"
9170 " %.80s found",
9171 Py_TYPE(separator)->tp_name);
9172 goto onError;
9173 }
9174 if (PyUnicode_READY(separator))
9175 goto onError;
9176 sep = separator;
9177 seplen = PyUnicode_GET_LENGTH(separator);
9178 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9179 /* inc refcount to keep this code path symmetric with the
9180 above case of a blank separator */
9181 Py_INCREF(sep);
9182 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009183 }
9184
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009185 /* There are at least two things to join, or else we have a subclass
9186 * of str in the sequence.
9187 * Do a pre-pass to figure out the total amount of space we'll
9188 * need (sz), and see whether all argument are strings.
9189 */
9190 sz = 0;
9191 for (i = 0; i < seqlen; i++) {
9192 const Py_ssize_t old_sz = sz;
9193 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 if (!PyUnicode_Check(item)) {
9195 PyErr_Format(PyExc_TypeError,
9196 "sequence item %zd: expected str instance,"
9197 " %.80s found",
9198 i, Py_TYPE(item)->tp_name);
9199 goto onError;
9200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 if (PyUnicode_READY(item) == -1)
9202 goto onError;
9203 sz += PyUnicode_GET_LENGTH(item);
9204 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9205 if (item_maxchar > maxchar)
9206 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009207 if (i != 0)
9208 sz += seplen;
9209 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9210 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009211 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009212 goto onError;
9213 }
9214 }
Tim Petersced69f82003-09-16 20:30:58 +00009215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009217 if (res == NULL)
9218 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009219
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009220 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009222 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009223 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009225 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009226 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009229 itemlen = PyUnicode_GET_LENGTH(item);
9230 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009231 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009232 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009233 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009236
Tim Peters05eba1f2004-08-27 21:32:02 +00009237 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009239 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009243 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009245 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 return NULL;
9247}
9248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249#define FILL(kind, data, value, start, length) \
9250 do { \
9251 Py_ssize_t i_ = 0; \
9252 assert(kind != PyUnicode_WCHAR_KIND); \
9253 switch ((kind)) { \
9254 case PyUnicode_1BYTE_KIND: { \
9255 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9256 memset(to_, (unsigned char)value, length); \
9257 break; \
9258 } \
9259 case PyUnicode_2BYTE_KIND: { \
9260 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9261 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9262 break; \
9263 } \
9264 default: { \
9265 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9266 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9267 break; \
9268 } \
9269 } \
9270 } while (0)
9271
Victor Stinner9310abb2011-10-05 00:59:23 +02009272static PyObject *
9273pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009274 Py_ssize_t left,
9275 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 PyObject *u;
9279 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009280 int kind;
9281 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282
9283 if (left < 0)
9284 left = 0;
9285 if (right < 0)
9286 right = 0;
9287
Tim Peters7a29bd52001-09-12 03:03:31 +00009288 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 Py_INCREF(self);
9290 return self;
9291 }
9292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9294 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009295 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9296 return NULL;
9297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9299 if (fill > maxchar)
9300 maxchar = fill;
9301 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009302 if (!u)
9303 return NULL;
9304
9305 kind = PyUnicode_KIND(u);
9306 data = PyUnicode_DATA(u);
9307 if (left)
9308 FILL(kind, data, fill, 0, left);
9309 if (right)
9310 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009311 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009312 assert(_PyUnicode_CheckConsistency(u, 1));
9313 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316
Alexander Belopolsky40018472011-02-26 01:02:56 +00009317PyObject *
9318PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321
9322 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 switch(PyUnicode_KIND(string)) {
9327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009328 if (PyUnicode_IS_ASCII(string))
9329 list = asciilib_splitlines(
9330 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9331 PyUnicode_GET_LENGTH(string), keepends);
9332 else
9333 list = ucs1lib_splitlines(
9334 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9335 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 break;
9337 case PyUnicode_2BYTE_KIND:
9338 list = ucs2lib_splitlines(
9339 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9340 PyUnicode_GET_LENGTH(string), keepends);
9341 break;
9342 case PyUnicode_4BYTE_KIND:
9343 list = ucs4lib_splitlines(
9344 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9345 PyUnicode_GET_LENGTH(string), keepends);
9346 break;
9347 default:
9348 assert(0);
9349 list = 0;
9350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351 Py_DECREF(string);
9352 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353}
9354
Alexander Belopolsky40018472011-02-26 01:02:56 +00009355static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009356split(PyObject *self,
9357 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 int kind1, kind2, kind;
9361 void *buf1, *buf2;
9362 Py_ssize_t len1, len2;
9363 PyObject* out;
9364
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009366 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 if (PyUnicode_READY(self) == -1)
9369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (substring == NULL)
9372 switch(PyUnicode_KIND(self)) {
9373 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009374 if (PyUnicode_IS_ASCII(self))
9375 return asciilib_split_whitespace(
9376 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9377 PyUnicode_GET_LENGTH(self), maxcount
9378 );
9379 else
9380 return ucs1lib_split_whitespace(
9381 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9382 PyUnicode_GET_LENGTH(self), maxcount
9383 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 case PyUnicode_2BYTE_KIND:
9385 return ucs2lib_split_whitespace(
9386 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9387 PyUnicode_GET_LENGTH(self), maxcount
9388 );
9389 case PyUnicode_4BYTE_KIND:
9390 return ucs4lib_split_whitespace(
9391 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9392 PyUnicode_GET_LENGTH(self), maxcount
9393 );
9394 default:
9395 assert(0);
9396 return NULL;
9397 }
9398
9399 if (PyUnicode_READY(substring) == -1)
9400 return NULL;
9401
9402 kind1 = PyUnicode_KIND(self);
9403 kind2 = PyUnicode_KIND(substring);
9404 kind = kind1 > kind2 ? kind1 : kind2;
9405 buf1 = PyUnicode_DATA(self);
9406 buf2 = PyUnicode_DATA(substring);
9407 if (kind1 != kind)
9408 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9409 if (!buf1)
9410 return NULL;
9411 if (kind2 != kind)
9412 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9413 if (!buf2) {
9414 if (kind1 != kind) PyMem_Free(buf1);
9415 return NULL;
9416 }
9417 len1 = PyUnicode_GET_LENGTH(self);
9418 len2 = PyUnicode_GET_LENGTH(substring);
9419
9420 switch(kind) {
9421 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009422 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9423 out = asciilib_split(
9424 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9425 else
9426 out = ucs1lib_split(
9427 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 break;
9429 case PyUnicode_2BYTE_KIND:
9430 out = ucs2lib_split(
9431 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9432 break;
9433 case PyUnicode_4BYTE_KIND:
9434 out = ucs4lib_split(
9435 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9436 break;
9437 default:
9438 out = NULL;
9439 }
9440 if (kind1 != kind)
9441 PyMem_Free(buf1);
9442 if (kind2 != kind)
9443 PyMem_Free(buf2);
9444 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
9446
Alexander Belopolsky40018472011-02-26 01:02:56 +00009447static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009448rsplit(PyObject *self,
9449 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009450 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 int kind1, kind2, kind;
9453 void *buf1, *buf2;
9454 Py_ssize_t len1, len2;
9455 PyObject* out;
9456
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009457 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009458 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 if (PyUnicode_READY(self) == -1)
9461 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (substring == NULL)
9464 switch(PyUnicode_KIND(self)) {
9465 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009466 if (PyUnicode_IS_ASCII(self))
9467 return asciilib_rsplit_whitespace(
9468 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9469 PyUnicode_GET_LENGTH(self), maxcount
9470 );
9471 else
9472 return ucs1lib_rsplit_whitespace(
9473 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9474 PyUnicode_GET_LENGTH(self), maxcount
9475 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 case PyUnicode_2BYTE_KIND:
9477 return ucs2lib_rsplit_whitespace(
9478 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9479 PyUnicode_GET_LENGTH(self), maxcount
9480 );
9481 case PyUnicode_4BYTE_KIND:
9482 return ucs4lib_rsplit_whitespace(
9483 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9484 PyUnicode_GET_LENGTH(self), maxcount
9485 );
9486 default:
9487 assert(0);
9488 return NULL;
9489 }
9490
9491 if (PyUnicode_READY(substring) == -1)
9492 return NULL;
9493
9494 kind1 = PyUnicode_KIND(self);
9495 kind2 = PyUnicode_KIND(substring);
9496 kind = kind1 > kind2 ? kind1 : kind2;
9497 buf1 = PyUnicode_DATA(self);
9498 buf2 = PyUnicode_DATA(substring);
9499 if (kind1 != kind)
9500 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9501 if (!buf1)
9502 return NULL;
9503 if (kind2 != kind)
9504 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9505 if (!buf2) {
9506 if (kind1 != kind) PyMem_Free(buf1);
9507 return NULL;
9508 }
9509 len1 = PyUnicode_GET_LENGTH(self);
9510 len2 = PyUnicode_GET_LENGTH(substring);
9511
9512 switch(kind) {
9513 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009514 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9515 out = asciilib_rsplit(
9516 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9517 else
9518 out = ucs1lib_rsplit(
9519 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 break;
9521 case PyUnicode_2BYTE_KIND:
9522 out = ucs2lib_rsplit(
9523 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9524 break;
9525 case PyUnicode_4BYTE_KIND:
9526 out = ucs4lib_rsplit(
9527 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9528 break;
9529 default:
9530 out = NULL;
9531 }
9532 if (kind1 != kind)
9533 PyMem_Free(buf1);
9534 if (kind2 != kind)
9535 PyMem_Free(buf2);
9536 return out;
9537}
9538
9539static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009540anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9541 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542{
9543 switch(kind) {
9544 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009545 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9546 return asciilib_find(buf1, len1, buf2, len2, offset);
9547 else
9548 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 case PyUnicode_2BYTE_KIND:
9550 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9551 case PyUnicode_4BYTE_KIND:
9552 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9553 }
9554 assert(0);
9555 return -1;
9556}
9557
9558static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009559anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9560 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561{
9562 switch(kind) {
9563 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009564 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9565 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9566 else
9567 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 case PyUnicode_2BYTE_KIND:
9569 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9570 case PyUnicode_4BYTE_KIND:
9571 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9572 }
9573 assert(0);
9574 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009575}
9576
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578replace(PyObject *self, PyObject *str1,
9579 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 PyObject *u;
9582 char *sbuf = PyUnicode_DATA(self);
9583 char *buf1 = PyUnicode_DATA(str1);
9584 char *buf2 = PyUnicode_DATA(str2);
9585 int srelease = 0, release1 = 0, release2 = 0;
9586 int skind = PyUnicode_KIND(self);
9587 int kind1 = PyUnicode_KIND(str1);
9588 int kind2 = PyUnicode_KIND(str2);
9589 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9590 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9591 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592
9593 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009596 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 if (skind < kind1)
9599 /* substring too wide to be present */
9600 goto nothing;
9601
9602 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009603 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009604 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009606 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009608 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 Py_UCS4 u1, u2, maxchar;
9610 int mayshrink, rkind;
9611 u1 = PyUnicode_READ_CHAR(str1, 0);
9612 if (!findchar(sbuf, PyUnicode_KIND(self),
9613 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009614 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 u2 = PyUnicode_READ_CHAR(str2, 0);
9616 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9617 /* Replacing u1 with u2 may cause a maxchar reduction in the
9618 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 if (u2 > maxchar) {
9620 maxchar = u2;
9621 mayshrink = 0;
9622 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009623 else
9624 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009626 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009628 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 rkind = PyUnicode_KIND(u);
9630 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9631 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009632 if (--maxcount < 0)
9633 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009637 unicode_adjust_maxchar(&u);
9638 if (u == NULL)
9639 goto error;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 int rkind = skind;
9643 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009644 PyObject *rstr;
9645 Py_UCS4 maxchar;
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 if (kind1 < rkind) {
9648 /* widen substring */
9649 buf1 = _PyUnicode_AsKind(str1, rkind);
9650 if (!buf1) goto error;
9651 release1 = 1;
9652 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009653 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009654 if (i < 0)
9655 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 if (rkind > kind2) {
9657 /* widen replacement */
9658 buf2 = _PyUnicode_AsKind(str2, rkind);
9659 if (!buf2) goto error;
9660 release2 = 1;
9661 }
9662 else if (rkind < kind2) {
9663 /* widen self and buf1 */
9664 rkind = kind2;
9665 if (release1) PyMem_Free(buf1);
9666 sbuf = _PyUnicode_AsKind(self, rkind);
9667 if (!sbuf) goto error;
9668 srelease = 1;
9669 buf1 = _PyUnicode_AsKind(str1, rkind);
9670 if (!buf1) goto error;
9671 release1 = 1;
9672 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009673 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9674 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9675 rstr = PyUnicode_New(slen, maxchar);
9676 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009678 res = PyUnicode_DATA(rstr);
9679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009681 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9683 buf2,
9684 PyUnicode_KIND_SIZE(rkind, len2));
9685 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009686
9687 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009688 i = anylib_find(rkind, self,
9689 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9690 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009691 if (i == -1)
9692 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9694 buf2,
9695 PyUnicode_KIND_SIZE(rkind, len2));
9696 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698
Victor Stinner25a4b292011-10-06 12:31:55 +02009699 u = rstr;
9700 unicode_adjust_maxchar(&u);
9701 if (!u)
9702 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_ssize_t n, i, j, ires;
9707 Py_ssize_t product, new_size;
9708 int rkind = skind;
Victor Stinner25a4b292011-10-06 12:31:55 +02009709 PyObject *rstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009711 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 if (kind1 < rkind) {
9714 buf1 = _PyUnicode_AsKind(str1, rkind);
9715 if (!buf1) goto error;
9716 release1 = 1;
9717 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009718 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009719 if (n == 0)
9720 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 if (kind2 < rkind) {
9722 buf2 = _PyUnicode_AsKind(str2, rkind);
9723 if (!buf2) goto error;
9724 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 else if (kind2 > rkind) {
9727 rkind = kind2;
9728 sbuf = _PyUnicode_AsKind(self, rkind);
9729 if (!sbuf) goto error;
9730 srelease = 1;
9731 if (release1) PyMem_Free(buf1);
9732 buf1 = _PyUnicode_AsKind(str1, rkind);
9733 if (!buf1) goto error;
9734 release1 = 1;
9735 }
9736 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9737 PyUnicode_GET_LENGTH(str1))); */
9738 product = n * (len2-len1);
9739 if ((product / (len2-len1)) != n) {
9740 PyErr_SetString(PyExc_OverflowError,
9741 "replace string is too long");
9742 goto error;
9743 }
9744 new_size = slen + product;
9745 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9746 PyErr_SetString(PyExc_OverflowError,
9747 "replace string is too long");
9748 goto error;
9749 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009750 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9751 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9752 rstr = PyUnicode_New(new_size, maxchar);
9753 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009755 res = PyUnicode_DATA(rstr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 ires = i = 0;
9757 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009758 while (n-- > 0) {
9759 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009760 j = anylib_find(rkind, self,
9761 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9762 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009763 if (j == -1)
9764 break;
9765 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009766 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9768 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9769 PyUnicode_KIND_SIZE(rkind, j-i));
9770 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009771 }
9772 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 if (len2 > 0) {
9774 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9775 buf2,
9776 PyUnicode_KIND_SIZE(rkind, len2));
9777 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009782 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9784 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9785 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009786 } else {
9787 /* interleave */
9788 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9790 buf2,
9791 PyUnicode_KIND_SIZE(rkind, len2));
9792 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009793 if (--n <= 0)
9794 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9796 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9797 PyUnicode_KIND_SIZE(rkind, 1));
9798 ires++;
9799 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9802 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9803 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009804 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009805 u = rstr;
9806 unicode_adjust_maxchar(&u);
9807 if (u == NULL)
9808 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (srelease)
9811 PyMem_FREE(sbuf);
9812 if (release1)
9813 PyMem_FREE(buf1);
9814 if (release2)
9815 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009816 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009818
Benjamin Peterson29060642009-01-31 22:14:21 +00009819 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009820 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 if (srelease)
9822 PyMem_FREE(sbuf);
9823 if (release1)
9824 PyMem_FREE(buf1);
9825 if (release2)
9826 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009827 if (PyUnicode_CheckExact(self)) {
9828 Py_INCREF(self);
9829 return (PyObject *) self;
9830 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009831 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 error:
9833 if (srelease && sbuf)
9834 PyMem_FREE(sbuf);
9835 if (release1 && buf1)
9836 PyMem_FREE(buf1);
9837 if (release2 && buf2)
9838 PyMem_FREE(buf2);
9839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840}
9841
9842/* --- Unicode Object Methods --------------------------------------------- */
9843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009844PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846\n\
9847Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009848characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
9850static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009851unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853 return fixup(self, fixtitle);
9854}
9855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009856PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858\n\
9859Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009860have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
9862static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009863unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865 return fixup(self, fixcapitalize);
9866}
9867
9868#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009869PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871\n\
9872Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009873normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
9875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009876unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877{
9878 PyObject *list;
9879 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009880 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 /* Split into words */
9883 list = split(self, NULL, -1);
9884 if (!list)
9885 return NULL;
9886
9887 /* Capitalize each word */
9888 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9889 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 if (item == NULL)
9892 goto onError;
9893 Py_DECREF(PyList_GET_ITEM(list, i));
9894 PyList_SET_ITEM(list, i, item);
9895 }
9896
9897 /* Join the words to form a new string */
9898 item = PyUnicode_Join(NULL, list);
9899
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 Py_DECREF(list);
9902 return (PyObject *)item;
9903}
9904#endif
9905
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009906/* Argument converter. Coerces to a single unicode character */
9907
9908static int
9909convert_uc(PyObject *obj, void *addr)
9910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009912 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009913
Benjamin Peterson14339b62009-01-31 16:36:08 +00009914 uniobj = PyUnicode_FromObject(obj);
9915 if (uniobj == NULL) {
9916 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 return 0;
9919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009922 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 Py_DECREF(uniobj);
9924 return 0;
9925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 Py_DECREF(uniobj);
9928 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009929}
9930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009931PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009934Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009935done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
9937static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009938unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009940 Py_ssize_t marg, left;
9941 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 Py_UCS4 fillchar = ' ';
9943
Victor Stinnere9a29352011-10-01 02:14:59 +02009944 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946
Victor Stinnere9a29352011-10-01 02:14:59 +02009947 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948 return NULL;
9949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 Py_INCREF(self);
9952 return (PyObject*) self;
9953 }
9954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956 left = marg / 2 + (marg & width & 1);
9957
Victor Stinner9310abb2011-10-05 00:59:23 +02009958 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959}
9960
Marc-André Lemburge5034372000-08-08 08:04:29 +00009961#if 0
9962
9963/* This code should go into some future Unicode collation support
9964 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009965 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009966
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009967/* speedy UTF-16 code point order comparison */
9968/* gleaned from: */
9969/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9970
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009971static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009972{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009973 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009974 0, 0, 0, 0, 0, 0, 0, 0,
9975 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009976 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009977};
9978
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979static int
9980unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9981{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009982 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009983
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984 Py_UNICODE *s1 = str1->str;
9985 Py_UNICODE *s2 = str2->str;
9986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 len1 = str1->_base._base.length;
9988 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009989
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009991 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009992
9993 c1 = *s1++;
9994 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009995
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 if (c1 > (1<<11) * 26)
9997 c1 += utf16Fixup[c1>>11];
9998 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009999 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010000 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +000010001
10002 if (c1 != c2)
10003 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +000010004
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010005 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 }
10007
10008 return (len1 < len2) ? -1 : (len1 != len2);
10009}
10010
Marc-André Lemburge5034372000-08-08 08:04:29 +000010011#else
10012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013/* This function assumes that str1 and str2 are readied by the caller. */
10014
Marc-André Lemburge5034372000-08-08 08:04:29 +000010015static int
10016unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 int kind1, kind2;
10019 void *data1, *data2;
10020 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 kind1 = PyUnicode_KIND(str1);
10023 kind2 = PyUnicode_KIND(str2);
10024 data1 = PyUnicode_DATA(str1);
10025 data2 = PyUnicode_DATA(str2);
10026 len1 = PyUnicode_GET_LENGTH(str1);
10027 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 for (i = 0; i < len1 && i < len2; ++i) {
10030 Py_UCS4 c1, c2;
10031 c1 = PyUnicode_READ(kind1, data1, i);
10032 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010033
10034 if (c1 != c2)
10035 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010036 }
10037
10038 return (len1 < len2) ? -1 : (len1 != len2);
10039}
10040
10041#endif
10042
Alexander Belopolsky40018472011-02-26 01:02:56 +000010043int
10044PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10047 if (PyUnicode_READY(left) == -1 ||
10048 PyUnicode_READY(right) == -1)
10049 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010050 return unicode_compare((PyUnicodeObject *)left,
10051 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010053 PyErr_Format(PyExc_TypeError,
10054 "Can't compare %.100s and %.100s",
10055 left->ob_type->tp_name,
10056 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057 return -1;
10058}
10059
Martin v. Löwis5b222132007-06-10 09:51:05 +000010060int
10061PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 Py_ssize_t i;
10064 int kind;
10065 void *data;
10066 Py_UCS4 chr;
10067
Victor Stinner910337b2011-10-03 03:20:16 +020010068 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (PyUnicode_READY(uni) == -1)
10070 return -1;
10071 kind = PyUnicode_KIND(uni);
10072 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010073 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10075 if (chr != str[i])
10076 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010077 /* This check keeps Python strings that end in '\0' from comparing equal
10078 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010081 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010082 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010083 return 0;
10084}
10085
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010086
Benjamin Peterson29060642009-01-31 22:14:21 +000010087#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010088 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010089
Alexander Belopolsky40018472011-02-26 01:02:56 +000010090PyObject *
10091PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010092{
10093 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010094
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010095 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10096 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (PyUnicode_READY(left) == -1 ||
10098 PyUnicode_READY(right) == -1)
10099 return NULL;
10100 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10101 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010102 if (op == Py_EQ) {
10103 Py_INCREF(Py_False);
10104 return Py_False;
10105 }
10106 if (op == Py_NE) {
10107 Py_INCREF(Py_True);
10108 return Py_True;
10109 }
10110 }
10111 if (left == right)
10112 result = 0;
10113 else
10114 result = unicode_compare((PyUnicodeObject *)left,
10115 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010116
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010117 /* Convert the return value to a Boolean */
10118 switch (op) {
10119 case Py_EQ:
10120 v = TEST_COND(result == 0);
10121 break;
10122 case Py_NE:
10123 v = TEST_COND(result != 0);
10124 break;
10125 case Py_LE:
10126 v = TEST_COND(result <= 0);
10127 break;
10128 case Py_GE:
10129 v = TEST_COND(result >= 0);
10130 break;
10131 case Py_LT:
10132 v = TEST_COND(result == -1);
10133 break;
10134 case Py_GT:
10135 v = TEST_COND(result == 1);
10136 break;
10137 default:
10138 PyErr_BadArgument();
10139 return NULL;
10140 }
10141 Py_INCREF(v);
10142 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010144
Brian Curtindfc80e32011-08-10 20:28:54 -050010145 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010146}
10147
Alexander Belopolsky40018472011-02-26 01:02:56 +000010148int
10149PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010150{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010151 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 int kind1, kind2, kind;
10153 void *buf1, *buf2;
10154 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010155 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010156
10157 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010158 sub = PyUnicode_FromObject(element);
10159 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 PyErr_Format(PyExc_TypeError,
10161 "'in <string>' requires string as left operand, not %s",
10162 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (PyUnicode_READY(sub) == -1)
10166 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010167
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010169 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 Py_DECREF(sub);
10171 return -1;
10172 }
10173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 kind1 = PyUnicode_KIND(str);
10175 kind2 = PyUnicode_KIND(sub);
10176 kind = kind1 > kind2 ? kind1 : kind2;
10177 buf1 = PyUnicode_DATA(str);
10178 buf2 = PyUnicode_DATA(sub);
10179 if (kind1 != kind)
10180 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10181 if (!buf1) {
10182 Py_DECREF(sub);
10183 return -1;
10184 }
10185 if (kind2 != kind)
10186 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10187 if (!buf2) {
10188 Py_DECREF(sub);
10189 if (kind1 != kind) PyMem_Free(buf1);
10190 return -1;
10191 }
10192 len1 = PyUnicode_GET_LENGTH(str);
10193 len2 = PyUnicode_GET_LENGTH(sub);
10194
10195 switch(kind) {
10196 case PyUnicode_1BYTE_KIND:
10197 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10198 break;
10199 case PyUnicode_2BYTE_KIND:
10200 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10201 break;
10202 case PyUnicode_4BYTE_KIND:
10203 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10204 break;
10205 default:
10206 result = -1;
10207 assert(0);
10208 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010209
10210 Py_DECREF(str);
10211 Py_DECREF(sub);
10212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 if (kind1 != kind)
10214 PyMem_Free(buf1);
10215 if (kind2 != kind)
10216 PyMem_Free(buf2);
10217
Guido van Rossum403d68b2000-03-13 15:55:09 +000010218 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010219}
10220
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221/* Concat to string or Unicode object giving a new Unicode object. */
10222
Alexander Belopolsky40018472011-02-26 01:02:56 +000010223PyObject *
10224PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 PyObject *u = NULL, *v = NULL, *w;
10227 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
10229 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010232 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
10237 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010238 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010242 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010243 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 }
10246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010248 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 w = PyUnicode_New(
10252 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10253 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010255 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010256 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10257 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 Py_DECREF(u);
10259 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010260 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 Py_XDECREF(u);
10265 Py_XDECREF(v);
10266 return NULL;
10267}
10268
Victor Stinnerb0923652011-10-04 01:17:31 +020010269static void
10270unicode_append_inplace(PyObject **p_left, PyObject *right)
10271{
10272 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010273
10274 assert(PyUnicode_IS_READY(*p_left));
10275 assert(PyUnicode_IS_READY(right));
10276
10277 left_len = PyUnicode_GET_LENGTH(*p_left);
10278 right_len = PyUnicode_GET_LENGTH(right);
10279 if (left_len > PY_SSIZE_T_MAX - right_len) {
10280 PyErr_SetString(PyExc_OverflowError,
10281 "strings are too large to concat");
10282 goto error;
10283 }
10284 new_len = left_len + right_len;
10285
10286 /* Now we own the last reference to 'left', so we can resize it
10287 * in-place.
10288 */
10289 if (unicode_resize(p_left, new_len) != 0) {
10290 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10291 * deallocated so it cannot be put back into
10292 * 'variable'. The MemoryError is raised when there
10293 * is no value in 'variable', which might (very
10294 * remotely) be a cause of incompatibilities.
10295 */
10296 goto error;
10297 }
10298 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010299 copy_characters(*p_left, left_len, right, 0, right_len);
10300 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010301 return;
10302
10303error:
10304 Py_DECREF(*p_left);
10305 *p_left = NULL;
10306}
10307
Walter Dörwald1ab83302007-05-18 17:15:44 +000010308void
Victor Stinner23e56682011-10-03 03:54:37 +020010309PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010310{
Victor Stinner23e56682011-10-03 03:54:37 +020010311 PyObject *left, *res;
10312
10313 if (p_left == NULL) {
10314 if (!PyErr_Occurred())
10315 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010316 return;
10317 }
Victor Stinner23e56682011-10-03 03:54:37 +020010318 left = *p_left;
10319 if (right == NULL || !PyUnicode_Check(left)) {
10320 if (!PyErr_Occurred())
10321 PyErr_BadInternalCall();
10322 goto error;
10323 }
10324
Victor Stinnere1335c72011-10-04 20:53:03 +020010325 if (PyUnicode_READY(left))
10326 goto error;
10327 if (PyUnicode_READY(right))
10328 goto error;
10329
Victor Stinner23e56682011-10-03 03:54:37 +020010330 if (PyUnicode_CheckExact(left) && left != unicode_empty
10331 && PyUnicode_CheckExact(right) && right != unicode_empty
10332 && unicode_resizable(left)
10333 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10334 || _PyUnicode_WSTR(left) != NULL))
10335 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010336 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10337 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010338 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010339 not so different than duplicating the string. */
10340 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010341 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010342 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010343 if (p_left != NULL)
10344 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010345 return;
10346 }
10347 }
10348
10349 res = PyUnicode_Concat(left, right);
10350 if (res == NULL)
10351 goto error;
10352 Py_DECREF(left);
10353 *p_left = res;
10354 return;
10355
10356error:
10357 Py_DECREF(*p_left);
10358 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010359}
10360
10361void
10362PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10363{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010364 PyUnicode_Append(pleft, right);
10365 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010366}
10367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010368PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010369 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010371Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010372string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
10375static PyObject *
10376unicode_count(PyUnicodeObject *self, PyObject *args)
10377{
10378 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010379 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010380 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 int kind1, kind2, kind;
10383 void *buf1, *buf2;
10384 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385
Jesus Ceaac451502011-04-20 17:09:23 +020010386 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10387 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 kind1 = PyUnicode_KIND(self);
10391 kind2 = PyUnicode_KIND(substring);
10392 kind = kind1 > kind2 ? kind1 : kind2;
10393 buf1 = PyUnicode_DATA(self);
10394 buf2 = PyUnicode_DATA(substring);
10395 if (kind1 != kind)
10396 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10397 if (!buf1) {
10398 Py_DECREF(substring);
10399 return NULL;
10400 }
10401 if (kind2 != kind)
10402 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10403 if (!buf2) {
10404 Py_DECREF(substring);
10405 if (kind1 != kind) PyMem_Free(buf1);
10406 return NULL;
10407 }
10408 len1 = PyUnicode_GET_LENGTH(self);
10409 len2 = PyUnicode_GET_LENGTH(substring);
10410
10411 ADJUST_INDICES(start, end, len1);
10412 switch(kind) {
10413 case PyUnicode_1BYTE_KIND:
10414 iresult = ucs1lib_count(
10415 ((Py_UCS1*)buf1) + start, end - start,
10416 buf2, len2, PY_SSIZE_T_MAX
10417 );
10418 break;
10419 case PyUnicode_2BYTE_KIND:
10420 iresult = ucs2lib_count(
10421 ((Py_UCS2*)buf1) + start, end - start,
10422 buf2, len2, PY_SSIZE_T_MAX
10423 );
10424 break;
10425 case PyUnicode_4BYTE_KIND:
10426 iresult = ucs4lib_count(
10427 ((Py_UCS4*)buf1) + start, end - start,
10428 buf2, len2, PY_SSIZE_T_MAX
10429 );
10430 break;
10431 default:
10432 assert(0); iresult = 0;
10433 }
10434
10435 result = PyLong_FromSsize_t(iresult);
10436
10437 if (kind1 != kind)
10438 PyMem_Free(buf1);
10439 if (kind2 != kind)
10440 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441
10442 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444 return result;
10445}
10446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010447PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010448 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010450Encode S using the codec registered for encoding. Default encoding\n\
10451is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010452handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010453a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10454'xmlcharrefreplace' as well as any other name registered with\n\
10455codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456
10457static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010458unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010460 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 char *encoding = NULL;
10462 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010463
Benjamin Peterson308d6372009-09-18 21:42:35 +000010464 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10465 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010467 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010468}
10469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010470PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472\n\
10473Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010474If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476static PyObject*
10477unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10478{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010479 Py_ssize_t i, j, line_pos, src_len, incr;
10480 Py_UCS4 ch;
10481 PyObject *u;
10482 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010484 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010485 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486
10487 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
Antoine Pitrou22425222011-10-04 19:10:51 +020010490 if (PyUnicode_READY(self) == -1)
10491 return NULL;
10492
Thomas Wouters7e474022000-07-16 12:04:32 +000010493 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010494 src_len = PyUnicode_GET_LENGTH(self);
10495 i = j = line_pos = 0;
10496 kind = PyUnicode_KIND(self);
10497 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010498 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010499 for (; i < src_len; i++) {
10500 ch = PyUnicode_READ(kind, src_data, i);
10501 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010502 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010504 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010506 goto overflow;
10507 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010509 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010512 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010513 goto overflow;
10514 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010516 if (ch == '\n' || ch == '\r')
10517 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010519 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010520 if (!found && PyUnicode_CheckExact(self)) {
10521 Py_INCREF((PyObject *) self);
10522 return (PyObject *) self;
10523 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010524
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010526 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 if (!u)
10528 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010529 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
Antoine Pitroue71d5742011-10-04 15:55:09 +020010531 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Antoine Pitroue71d5742011-10-04 15:55:09 +020010533 for (; i < src_len; i++) {
10534 ch = PyUnicode_READ(kind, src_data, i);
10535 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010537 incr = tabsize - (line_pos % tabsize);
10538 line_pos += incr;
10539 while (incr--) {
10540 PyUnicode_WRITE(kind, dest_data, j, ' ');
10541 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010542 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010544 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010546 line_pos++;
10547 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010548 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010549 if (ch == '\n' || ch == '\r')
10550 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010552 }
10553 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010554#ifndef DONT_MAKE_RESULT_READY
10555 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 Py_DECREF(u);
10557 return NULL;
10558 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010559#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010560 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010562
Antoine Pitroue71d5742011-10-04 15:55:09 +020010563 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010564 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566}
10567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010568PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570\n\
10571Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010572such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573arguments start and end are interpreted as in slice notation.\n\
10574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010575Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576
10577static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579{
Jesus Ceaac451502011-04-20 17:09:23 +020010580 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010581 Py_ssize_t start;
10582 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010583 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
Jesus Ceaac451502011-04-20 17:09:23 +020010585 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10586 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (PyUnicode_READY(self) == -1)
10590 return NULL;
10591 if (PyUnicode_READY(substring) == -1)
10592 return NULL;
10593
10594 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 asciilib_find_slice, ucs1lib_find_slice,
10596 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
10600 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (result == -2)
10603 return NULL;
10604
Christian Heimes217cfd12007-12-02 14:31:20 +000010605 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606}
10607
10608static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010609unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010611 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10612 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615}
10616
Guido van Rossumc2504932007-09-18 19:42:40 +000010617/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010618 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010619static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010620unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621{
Guido van Rossumc2504932007-09-18 19:42:40 +000010622 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010623 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (_PyUnicode_HASH(self) != -1)
10626 return _PyUnicode_HASH(self);
10627 if (PyUnicode_READY(self) == -1)
10628 return -1;
10629 len = PyUnicode_GET_LENGTH(self);
10630
10631 /* The hash function as a macro, gets expanded three times below. */
10632#define HASH(P) \
10633 x = (Py_uhash_t)*P << 7; \
10634 while (--len >= 0) \
10635 x = (1000003*x) ^ (Py_uhash_t)*P++;
10636
10637 switch (PyUnicode_KIND(self)) {
10638 case PyUnicode_1BYTE_KIND: {
10639 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10640 HASH(c);
10641 break;
10642 }
10643 case PyUnicode_2BYTE_KIND: {
10644 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10645 HASH(s);
10646 break;
10647 }
10648 default: {
10649 Py_UCS4 *l;
10650 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10651 "Impossible switch case in unicode_hash");
10652 l = PyUnicode_4BYTE_DATA(self);
10653 HASH(l);
10654 break;
10655 }
10656 }
10657 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10658
Guido van Rossumc2504932007-09-18 19:42:40 +000010659 if (x == -1)
10660 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010662 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010666PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010669Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
10671static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010674 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010675 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010676 Py_ssize_t start;
10677 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
Jesus Ceaac451502011-04-20 17:09:23 +020010679 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10680 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
10685 if (PyUnicode_READY(substring) == -1)
10686 return NULL;
10687
10688 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010689 asciilib_find_slice, ucs1lib_find_slice,
10690 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
10694 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (result == -2)
10697 return NULL;
10698
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699 if (result < 0) {
10700 PyErr_SetString(PyExc_ValueError, "substring not found");
10701 return NULL;
10702 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703
Christian Heimes217cfd12007-12-02 14:31:20 +000010704 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010710Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010711at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010714unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 Py_ssize_t i, length;
10717 int kind;
10718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 int cased;
10720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
10723 length = PyUnicode_GET_LENGTH(self);
10724 kind = PyUnicode_KIND(self);
10725 data = PyUnicode_DATA(self);
10726
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (length == 1)
10729 return PyBool_FromLong(
10730 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010732 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010735
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 for (i = 0; i < length; i++) {
10738 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010739
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10741 return PyBool_FromLong(0);
10742 else if (!cased && Py_UNICODE_ISLOWER(ch))
10743 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010745 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746}
10747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010751Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010755unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_ssize_t i, length;
10758 int kind;
10759 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 int cased;
10761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 if (PyUnicode_READY(self) == -1)
10763 return NULL;
10764 length = PyUnicode_GET_LENGTH(self);
10765 kind = PyUnicode_KIND(self);
10766 data = PyUnicode_DATA(self);
10767
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (length == 1)
10770 return PyBool_FromLong(
10771 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010773 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010776
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 for (i = 0; i < length; i++) {
10779 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010780
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10782 return PyBool_FromLong(0);
10783 else if (!cased && Py_UNICODE_ISUPPER(ch))
10784 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010786 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787}
10788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010789PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010792Return True if S is a titlecased string and there is at least one\n\
10793character in S, i.e. upper- and titlecase characters may only\n\
10794follow uncased characters and lowercase characters only cased ones.\n\
10795Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796
10797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010798unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 Py_ssize_t i, length;
10801 int kind;
10802 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803 int cased, previous_is_cased;
10804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (PyUnicode_READY(self) == -1)
10806 return NULL;
10807 length = PyUnicode_GET_LENGTH(self);
10808 kind = PyUnicode_KIND(self);
10809 data = PyUnicode_DATA(self);
10810
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 if (length == 1) {
10813 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10814 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10815 (Py_UNICODE_ISUPPER(ch) != 0));
10816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010818 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 cased = 0;
10823 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 for (i = 0; i < length; i++) {
10825 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010826
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10828 if (previous_is_cased)
10829 return PyBool_FromLong(0);
10830 previous_is_cased = 1;
10831 cased = 1;
10832 }
10833 else if (Py_UNICODE_ISLOWER(ch)) {
10834 if (!previous_is_cased)
10835 return PyBool_FromLong(0);
10836 previous_is_cased = 1;
10837 cased = 1;
10838 }
10839 else
10840 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010842 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843}
10844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010845PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010848Return True if all characters in S are whitespace\n\
10849and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
10851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010852unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 Py_ssize_t i, length;
10855 int kind;
10856 void *data;
10857
10858 if (PyUnicode_READY(self) == -1)
10859 return NULL;
10860 length = PyUnicode_GET_LENGTH(self);
10861 kind = PyUnicode_KIND(self);
10862 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 if (length == 1)
10866 return PyBool_FromLong(
10867 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010869 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 for (i = 0; i < length; i++) {
10874 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010875 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010878 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879}
10880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010881PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010883\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010884Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010885and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010886
10887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010888unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 Py_ssize_t i, length;
10891 int kind;
10892 void *data;
10893
10894 if (PyUnicode_READY(self) == -1)
10895 return NULL;
10896 length = PyUnicode_GET_LENGTH(self);
10897 kind = PyUnicode_KIND(self);
10898 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010899
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010900 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if (length == 1)
10902 return PyBool_FromLong(
10903 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010904
10905 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 for (i = 0; i < length; i++) {
10910 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010912 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010913 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010914}
10915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010916PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010917 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010918\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010919Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010920and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010921
10922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010923unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 int kind;
10926 void *data;
10927 Py_ssize_t len, i;
10928
10929 if (PyUnicode_READY(self) == -1)
10930 return NULL;
10931
10932 kind = PyUnicode_KIND(self);
10933 data = PyUnicode_DATA(self);
10934 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010935
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010936 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (len == 1) {
10938 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10939 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10940 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010941
10942 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 for (i = 0; i < len; i++) {
10947 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010948 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010950 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010951 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010952}
10953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010954PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010955 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010957Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010958False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
10960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010961unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 Py_ssize_t i, length;
10964 int kind;
10965 void *data;
10966
10967 if (PyUnicode_READY(self) == -1)
10968 return NULL;
10969 length = PyUnicode_GET_LENGTH(self);
10970 kind = PyUnicode_KIND(self);
10971 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 if (length == 1)
10975 return PyBool_FromLong(
10976 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010978 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 for (i = 0; i < length; i++) {
10983 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987}
10988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010989PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010992Return True if all characters in S are digits\n\
10993and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994
10995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010996unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 Py_ssize_t i, length;
10999 int kind;
11000 void *data;
11001
11002 if (PyUnicode_READY(self) == -1)
11003 return NULL;
11004 length = PyUnicode_GET_LENGTH(self);
11005 kind = PyUnicode_KIND(self);
11006 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (length == 1) {
11010 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11011 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011014 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 for (i = 0; i < length; i++) {
11019 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023}
11024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011025PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011028Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011029False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030
11031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011032unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 Py_ssize_t i, length;
11035 int kind;
11036 void *data;
11037
11038 if (PyUnicode_READY(self) == -1)
11039 return NULL;
11040 length = PyUnicode_GET_LENGTH(self);
11041 kind = PyUnicode_KIND(self);
11042 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 1)
11046 return PyBool_FromLong(
11047 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011049 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011051 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 for (i = 0; i < length; i++) {
11054 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011057 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058}
11059
Martin v. Löwis47383402007-08-15 07:32:56 +000011060int
11061PyUnicode_IsIdentifier(PyObject *self)
11062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 int kind;
11064 void *data;
11065 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011066 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 if (PyUnicode_READY(self) == -1) {
11069 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011070 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 }
11072
11073 /* Special case for empty strings */
11074 if (PyUnicode_GET_LENGTH(self) == 0)
11075 return 0;
11076 kind = PyUnicode_KIND(self);
11077 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011078
11079 /* PEP 3131 says that the first character must be in
11080 XID_Start and subsequent characters in XID_Continue,
11081 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011082 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011083 letters, digits, underscore). However, given the current
11084 definition of XID_Start and XID_Continue, it is sufficient
11085 to check just for these, except that _ must be allowed
11086 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011088 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011089 return 0;
11090
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011091 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011094 return 1;
11095}
11096
11097PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011099\n\
11100Return True if S is a valid identifier according\n\
11101to the language definition.");
11102
11103static PyObject*
11104unicode_isidentifier(PyObject *self)
11105{
11106 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11107}
11108
Georg Brandl559e5d72008-06-11 18:37:52 +000011109PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011111\n\
11112Return True if all characters in S are considered\n\
11113printable in repr() or S is empty, False otherwise.");
11114
11115static PyObject*
11116unicode_isprintable(PyObject *self)
11117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 Py_ssize_t i, length;
11119 int kind;
11120 void *data;
11121
11122 if (PyUnicode_READY(self) == -1)
11123 return NULL;
11124 length = PyUnicode_GET_LENGTH(self);
11125 kind = PyUnicode_KIND(self);
11126 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011127
11128 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 if (length == 1)
11130 return PyBool_FromLong(
11131 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 for (i = 0; i < length; i++) {
11134 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011135 Py_RETURN_FALSE;
11136 }
11137 }
11138 Py_RETURN_TRUE;
11139}
11140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011142 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143\n\
11144Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011145iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
11147static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011148unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011150 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151}
11152
Martin v. Löwis18e16552006-02-15 17:27:45 +000011153static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154unicode_length(PyUnicodeObject *self)
11155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 if (PyUnicode_READY(self) == -1)
11157 return -1;
11158 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159}
11160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011161PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011164Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011165done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
11167static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011168unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011170 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 Py_UCS4 fillchar = ' ';
11172
11173 if (PyUnicode_READY(self) == -1)
11174 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011175
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011176 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177 return NULL;
11178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 Py_INCREF(self);
11181 return (PyObject*) self;
11182 }
11183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185}
11186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011187PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011193unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 return fixup(self, fixlower);
11196}
11197
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11206
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011207/* externally visible for str.strip(unicode) */
11208PyObject *
11209_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 void *data;
11212 int kind;
11213 Py_ssize_t i, j, len;
11214 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11217 return NULL;
11218
11219 kind = PyUnicode_KIND(self);
11220 data = PyUnicode_DATA(self);
11221 len = PyUnicode_GET_LENGTH(self);
11222 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11223 PyUnicode_DATA(sepobj),
11224 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011225
Benjamin Peterson14339b62009-01-31 16:36:08 +000011226 i = 0;
11227 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 while (i < len &&
11229 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 i++;
11231 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011232 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011233
Benjamin Peterson14339b62009-01-31 16:36:08 +000011234 j = len;
11235 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 do {
11237 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 } while (j >= i &&
11239 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011241 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011242
Victor Stinner12bab6d2011-10-01 01:53:49 +020011243 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244}
11245
11246PyObject*
11247PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11248{
11249 unsigned char *data;
11250 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011251 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252
Victor Stinnerde636f32011-10-01 03:55:54 +020011253 if (PyUnicode_READY(self) == -1)
11254 return NULL;
11255
11256 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11257
Victor Stinner12bab6d2011-10-01 01:53:49 +020011258 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011260 if (PyUnicode_CheckExact(self)) {
11261 Py_INCREF(self);
11262 return self;
11263 }
11264 else
11265 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 }
11267
Victor Stinner12bab6d2011-10-01 01:53:49 +020011268 length = end - start;
11269 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011270 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271
Victor Stinnerde636f32011-10-01 03:55:54 +020011272 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011273 PyErr_SetString(PyExc_IndexError, "string index out of range");
11274 return NULL;
11275 }
11276
Victor Stinnerb9275c12011-10-05 14:01:42 +020011277 if (PyUnicode_IS_ASCII(self)) {
11278 kind = PyUnicode_KIND(self);
11279 data = PyUnicode_1BYTE_DATA(self);
11280 return unicode_fromascii(data + start, length);
11281 }
11282 else {
11283 kind = PyUnicode_KIND(self);
11284 data = PyUnicode_1BYTE_DATA(self);
11285 return PyUnicode_FromKindAndData(kind,
11286 data + PyUnicode_KIND_SIZE(kind, start),
11287 length);
11288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290
11291static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011292do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 int kind;
11295 void *data;
11296 Py_ssize_t len, i, j;
11297
11298 if (PyUnicode_READY(self) == -1)
11299 return NULL;
11300
11301 kind = PyUnicode_KIND(self);
11302 data = PyUnicode_DATA(self);
11303 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011304
Benjamin Peterson14339b62009-01-31 16:36:08 +000011305 i = 0;
11306 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011308 i++;
11309 }
11310 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011311
Benjamin Peterson14339b62009-01-31 16:36:08 +000011312 j = len;
11313 if (striptype != LEFTSTRIP) {
11314 do {
11315 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011317 j++;
11318 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011319
Victor Stinner12bab6d2011-10-01 01:53:49 +020011320 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011323
11324static PyObject *
11325do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011327 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011328
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11330 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011331
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 if (sep != NULL && sep != Py_None) {
11333 if (PyUnicode_Check(sep))
11334 return _PyUnicode_XStrip(self, striptype, sep);
11335 else {
11336 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 "%s arg must be None or str",
11338 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011339 return NULL;
11340 }
11341 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011342
Benjamin Peterson14339b62009-01-31 16:36:08 +000011343 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011344}
11345
11346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011349\n\
11350Return a copy of the string S with leading and trailing\n\
11351whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011352If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011353
11354static PyObject *
11355unicode_strip(PyUnicodeObject *self, PyObject *args)
11356{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011357 if (PyTuple_GET_SIZE(args) == 0)
11358 return do_strip(self, BOTHSTRIP); /* Common case */
11359 else
11360 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011361}
11362
11363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011366\n\
11367Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011368If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011369
11370static PyObject *
11371unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011373 if (PyTuple_GET_SIZE(args) == 0)
11374 return do_strip(self, LEFTSTRIP); /* Common case */
11375 else
11376 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011377}
11378
11379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011380PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011382\n\
11383Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011384If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011385
11386static PyObject *
11387unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011389 if (PyTuple_GET_SIZE(args) == 0)
11390 return do_strip(self, RIGHTSTRIP); /* Common case */
11391 else
11392 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011393}
11394
11395
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011397unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
11399 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
Georg Brandl222de0f2009-04-12 12:01:50 +000011402 if (len < 1) {
11403 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011404 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
Tim Peters7a29bd52001-09-12 03:03:31 +000011407 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408 /* no repeat, return original string */
11409 Py_INCREF(str);
11410 return (PyObject*) str;
11411 }
Tim Peters8f422462000-09-09 06:13:41 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (PyUnicode_READY(str) == -1)
11414 return NULL;
11415
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011416 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011417 PyErr_SetString(PyExc_OverflowError,
11418 "repeated string is too long");
11419 return NULL;
11420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 if (!u)
11425 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011426 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (PyUnicode_GET_LENGTH(str) == 1) {
11429 const int kind = PyUnicode_KIND(str);
11430 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11431 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011432 if (kind == PyUnicode_1BYTE_KIND)
11433 memset(to, (unsigned char)fill_char, len);
11434 else {
11435 for (n = 0; n < len; ++n)
11436 PyUnicode_WRITE(kind, to, n, fill_char);
11437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 }
11439 else {
11440 /* number of characters copied this far */
11441 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11442 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11443 char *to = (char *) PyUnicode_DATA(u);
11444 Py_MEMCPY(to, PyUnicode_DATA(str),
11445 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 n = (done <= nchars-done) ? done : nchars-done;
11448 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011449 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
11452
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011453 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 return (PyObject*) u;
11455}
11456
Alexander Belopolsky40018472011-02-26 01:02:56 +000011457PyObject *
11458PyUnicode_Replace(PyObject *obj,
11459 PyObject *subobj,
11460 PyObject *replobj,
11461 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462{
11463 PyObject *self;
11464 PyObject *str1;
11465 PyObject *str2;
11466 PyObject *result;
11467
11468 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011469 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011472 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 Py_DECREF(self);
11474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
11476 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011477 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 Py_DECREF(self);
11479 Py_DECREF(str1);
11480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 Py_DECREF(self);
11484 Py_DECREF(str1);
11485 Py_DECREF(str2);
11486 return result;
11487}
11488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011490 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491\n\
11492Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011493old replaced by new. If the optional argument count is\n\
11494given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 PyObject *str1;
11500 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011501 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 PyObject *result;
11503
Martin v. Löwis18e16552006-02-15 17:27:45 +000011504 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 str1 = PyUnicode_FromObject(str1);
11509 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11510 return NULL;
11511 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011512 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 Py_DECREF(str1);
11514 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517 result = replace(self, str1, str2, maxcount);
11518
11519 Py_DECREF(str1);
11520 Py_DECREF(str2);
11521 return result;
11522}
11523
Alexander Belopolsky40018472011-02-26 01:02:56 +000011524static PyObject *
11525unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011527 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 Py_ssize_t isize;
11529 Py_ssize_t osize, squote, dquote, i, o;
11530 Py_UCS4 max, quote;
11531 int ikind, okind;
11532 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011535 return NULL;
11536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 isize = PyUnicode_GET_LENGTH(unicode);
11538 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 /* Compute length of output, quote characters, and
11541 maximum character */
11542 osize = 2; /* quotes */
11543 max = 127;
11544 squote = dquote = 0;
11545 ikind = PyUnicode_KIND(unicode);
11546 for (i = 0; i < isize; i++) {
11547 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11548 switch (ch) {
11549 case '\'': squote++; osize++; break;
11550 case '"': dquote++; osize++; break;
11551 case '\\': case '\t': case '\r': case '\n':
11552 osize += 2; break;
11553 default:
11554 /* Fast-path ASCII */
11555 if (ch < ' ' || ch == 0x7f)
11556 osize += 4; /* \xHH */
11557 else if (ch < 0x7f)
11558 osize++;
11559 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11560 osize++;
11561 max = ch > max ? ch : max;
11562 }
11563 else if (ch < 0x100)
11564 osize += 4; /* \xHH */
11565 else if (ch < 0x10000)
11566 osize += 6; /* \uHHHH */
11567 else
11568 osize += 10; /* \uHHHHHHHH */
11569 }
11570 }
11571
11572 quote = '\'';
11573 if (squote) {
11574 if (dquote)
11575 /* Both squote and dquote present. Use squote,
11576 and escape them */
11577 osize += squote;
11578 else
11579 quote = '"';
11580 }
11581
11582 repr = PyUnicode_New(osize, max);
11583 if (repr == NULL)
11584 return NULL;
11585 okind = PyUnicode_KIND(repr);
11586 odata = PyUnicode_DATA(repr);
11587
11588 PyUnicode_WRITE(okind, odata, 0, quote);
11589 PyUnicode_WRITE(okind, odata, osize-1, quote);
11590
11591 for (i = 0, o = 1; i < isize; i++) {
11592 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011593
11594 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if ((ch == quote) || (ch == '\\')) {
11596 PyUnicode_WRITE(okind, odata, o++, '\\');
11597 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011598 continue;
11599 }
11600
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011602 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 PyUnicode_WRITE(okind, odata, o++, '\\');
11604 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011605 }
11606 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 PyUnicode_WRITE(okind, odata, o++, '\\');
11608 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011609 }
11610 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 PyUnicode_WRITE(okind, odata, o++, '\\');
11612 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011613 }
11614
11615 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011616 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 PyUnicode_WRITE(okind, odata, o++, '\\');
11618 PyUnicode_WRITE(okind, odata, o++, 'x');
11619 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11620 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011621 }
11622
Georg Brandl559e5d72008-06-11 18:37:52 +000011623 /* Copy ASCII characters as-is */
11624 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011626 }
11627
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011629 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011630 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011631 (categories Z* and C* except ASCII space)
11632 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011634 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (ch <= 0xff) {
11636 PyUnicode_WRITE(okind, odata, o++, '\\');
11637 PyUnicode_WRITE(okind, odata, o++, 'x');
11638 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11639 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011640 }
11641 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 else if (ch >= 0x10000) {
11643 PyUnicode_WRITE(okind, odata, o++, '\\');
11644 PyUnicode_WRITE(okind, odata, o++, 'U');
11645 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11646 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11647 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11648 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11649 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11650 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11651 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11652 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011653 }
11654 /* Map 16-bit characters to '\uxxxx' */
11655 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 PyUnicode_WRITE(okind, odata, o++, '\\');
11657 PyUnicode_WRITE(okind, odata, o++, 'u');
11658 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11659 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11660 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11661 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011662 }
11663 }
11664 /* Copy characters as-is */
11665 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011667 }
11668 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011671 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011672 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673}
11674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677\n\
11678Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011679such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680arguments start and end are interpreted as in slice notation.\n\
11681\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
11684static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Jesus Ceaac451502011-04-20 17:09:23 +020011687 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011688 Py_ssize_t start;
11689 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011690 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
Jesus Ceaac451502011-04-20 17:09:23 +020011692 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11693 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (PyUnicode_READY(self) == -1)
11697 return NULL;
11698 if (PyUnicode_READY(substring) == -1)
11699 return NULL;
11700
11701 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011702 asciilib_rfind_slice, ucs1lib_rfind_slice,
11703 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011705 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
11707 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (result == -2)
11710 return NULL;
11711
Christian Heimes217cfd12007-12-02 14:31:20 +000011712 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713}
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
11720static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Jesus Ceaac451502011-04-20 17:09:23 +020011723 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011724 Py_ssize_t start;
11725 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011726 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Jesus Ceaac451502011-04-20 17:09:23 +020011728 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11729 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 if (PyUnicode_READY(self) == -1)
11733 return NULL;
11734 if (PyUnicode_READY(substring) == -1)
11735 return NULL;
11736
11737 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011738 asciilib_rfind_slice, ucs1lib_rfind_slice,
11739 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011741 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
11743 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (result == -2)
11746 return NULL;
11747
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 if (result < 0) {
11749 PyErr_SetString(PyExc_ValueError, "substring not found");
11750 return NULL;
11751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752
Christian Heimes217cfd12007-12-02 14:31:20 +000011753 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754}
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011759Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011760done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
11762static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011763unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011765 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 Py_UCS4 fillchar = ' ';
11767
Victor Stinnere9a29352011-10-01 02:14:59 +020011768 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011770
Victor Stinnere9a29352011-10-01 02:14:59 +020011771 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 return NULL;
11773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 Py_INCREF(self);
11776 return (PyObject*) self;
11777 }
11778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
Alexander Belopolsky40018472011-02-26 01:02:56 +000011782PyObject *
11783PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784{
11785 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011786
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 s = PyUnicode_FromObject(s);
11788 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011789 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 if (sep != NULL) {
11791 sep = PyUnicode_FromObject(sep);
11792 if (sep == NULL) {
11793 Py_DECREF(s);
11794 return NULL;
11795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 }
11797
Victor Stinner9310abb2011-10-05 00:59:23 +020011798 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
11800 Py_DECREF(s);
11801 Py_XDECREF(sep);
11802 return result;
11803}
11804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011805PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807\n\
11808Return a list of the words in S, using sep as the\n\
11809delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011810splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011811whitespace string is a separator and empty strings are\n\
11812removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
11814static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011815unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816{
11817 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011818 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Martin v. Löwis18e16552006-02-15 17:27:45 +000011820 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 return NULL;
11822
11823 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011826 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829}
11830
Thomas Wouters477c8d52006-05-27 19:21:47 +000011831PyObject *
11832PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11833{
11834 PyObject* str_obj;
11835 PyObject* sep_obj;
11836 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 int kind1, kind2, kind;
11838 void *buf1 = NULL, *buf2 = NULL;
11839 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011840
11841 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011842 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011844 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011846 Py_DECREF(str_obj);
11847 return NULL;
11848 }
11849
Victor Stinner14f8f022011-10-05 20:58:25 +020011850 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011852 kind = Py_MAX(kind1, kind2);
11853 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011855 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 if (!buf1)
11857 goto onError;
11858 buf2 = PyUnicode_DATA(sep_obj);
11859 if (kind2 != kind)
11860 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11861 if (!buf2)
11862 goto onError;
11863 len1 = PyUnicode_GET_LENGTH(str_obj);
11864 len2 = PyUnicode_GET_LENGTH(sep_obj);
11865
Victor Stinner14f8f022011-10-05 20:58:25 +020011866 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011868 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11869 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11870 else
11871 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 break;
11873 case PyUnicode_2BYTE_KIND:
11874 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11875 break;
11876 case PyUnicode_4BYTE_KIND:
11877 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11878 break;
11879 default:
11880 assert(0);
11881 out = 0;
11882 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011883
11884 Py_DECREF(sep_obj);
11885 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (kind1 != kind)
11887 PyMem_Free(buf1);
11888 if (kind2 != kind)
11889 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011890
11891 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 onError:
11893 Py_DECREF(sep_obj);
11894 Py_DECREF(str_obj);
11895 if (kind1 != kind && buf1)
11896 PyMem_Free(buf1);
11897 if (kind2 != kind && buf2)
11898 PyMem_Free(buf2);
11899 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011900}
11901
11902
11903PyObject *
11904PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11905{
11906 PyObject* str_obj;
11907 PyObject* sep_obj;
11908 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 int kind1, kind2, kind;
11910 void *buf1 = NULL, *buf2 = NULL;
11911 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011912
11913 str_obj = PyUnicode_FromObject(str_in);
11914 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011916 sep_obj = PyUnicode_FromObject(sep_in);
11917 if (!sep_obj) {
11918 Py_DECREF(str_obj);
11919 return NULL;
11920 }
11921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 kind1 = PyUnicode_KIND(str_in);
11923 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011924 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 buf1 = PyUnicode_DATA(str_in);
11926 if (kind1 != kind)
11927 buf1 = _PyUnicode_AsKind(str_in, kind);
11928 if (!buf1)
11929 goto onError;
11930 buf2 = PyUnicode_DATA(sep_obj);
11931 if (kind2 != kind)
11932 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11933 if (!buf2)
11934 goto onError;
11935 len1 = PyUnicode_GET_LENGTH(str_obj);
11936 len2 = PyUnicode_GET_LENGTH(sep_obj);
11937
11938 switch(PyUnicode_KIND(str_in)) {
11939 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011940 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11941 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11942 else
11943 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 break;
11945 case PyUnicode_2BYTE_KIND:
11946 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11947 break;
11948 case PyUnicode_4BYTE_KIND:
11949 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11950 break;
11951 default:
11952 assert(0);
11953 out = 0;
11954 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011955
11956 Py_DECREF(sep_obj);
11957 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (kind1 != kind)
11959 PyMem_Free(buf1);
11960 if (kind2 != kind)
11961 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011962
11963 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 onError:
11965 Py_DECREF(sep_obj);
11966 Py_DECREF(str_obj);
11967 if (kind1 != kind && buf1)
11968 PyMem_Free(buf1);
11969 if (kind2 != kind && buf2)
11970 PyMem_Free(buf2);
11971 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011972}
11973
11974PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011976\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011977Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011978the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011979found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011980
11981static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011982unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983{
Victor Stinner9310abb2011-10-05 00:59:23 +020011984 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011985}
11986
11987PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011988 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011990Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011991the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011992separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011993
11994static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011995unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011996{
Victor Stinner9310abb2011-10-05 00:59:23 +020011997 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998}
11999
Alexander Belopolsky40018472011-02-26 01:02:56 +000012000PyObject *
12001PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012002{
12003 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012005 s = PyUnicode_FromObject(s);
12006 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012007 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (sep != NULL) {
12009 sep = PyUnicode_FromObject(sep);
12010 if (sep == NULL) {
12011 Py_DECREF(s);
12012 return NULL;
12013 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012014 }
12015
Victor Stinner9310abb2011-10-05 00:59:23 +020012016 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012017
12018 Py_DECREF(s);
12019 Py_XDECREF(sep);
12020 return result;
12021}
12022
12023PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012025\n\
12026Return a list of the words in S, using sep as the\n\
12027delimiter string, starting at the end of the string and\n\
12028working to the front. If maxsplit is given, at most maxsplit\n\
12029splits are done. If sep is not specified, any whitespace string\n\
12030is a separator.");
12031
12032static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012033unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012034{
12035 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012036 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012037
Martin v. Löwis18e16552006-02-15 17:27:45 +000012038 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012039 return NULL;
12040
12041 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012042 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012043 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012044 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012045 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012046 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012047}
12048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012049PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051\n\
12052Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012053Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012054is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
12056static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012057unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012059 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012060 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012062 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12063 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 return NULL;
12065
Guido van Rossum86662912000-04-11 15:38:46 +000012066 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067}
12068
12069static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012070PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Walter Dörwald346737f2007-05-31 10:44:43 +000012072 if (PyUnicode_CheckExact(self)) {
12073 Py_INCREF(self);
12074 return self;
12075 } else
12076 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012077 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078}
12079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012080PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082\n\
12083Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012084and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
12086static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012087unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 return fixup(self, fixswapcase);
12090}
12091
Georg Brandlceee0772007-11-27 23:48:05 +000012092PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012094\n\
12095Return a translation table usable for str.translate().\n\
12096If there is only one argument, it must be a dictionary mapping Unicode\n\
12097ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012098Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012099If there are two arguments, they must be strings of equal length, and\n\
12100in the resulting dictionary, each character in x will be mapped to the\n\
12101character at the same position in y. If there is a third argument, it\n\
12102must be a string, whose characters will be mapped to None in the result.");
12103
12104static PyObject*
12105unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12106{
12107 PyObject *x, *y = NULL, *z = NULL;
12108 PyObject *new = NULL, *key, *value;
12109 Py_ssize_t i = 0;
12110 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111
Georg Brandlceee0772007-11-27 23:48:05 +000012112 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12113 return NULL;
12114 new = PyDict_New();
12115 if (!new)
12116 return NULL;
12117 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 int x_kind, y_kind, z_kind;
12119 void *x_data, *y_data, *z_data;
12120
Georg Brandlceee0772007-11-27 23:48:05 +000012121 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012122 if (!PyUnicode_Check(x)) {
12123 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12124 "be a string if there is a second argument");
12125 goto err;
12126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012128 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12129 "arguments must have equal length");
12130 goto err;
12131 }
12132 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 x_kind = PyUnicode_KIND(x);
12134 y_kind = PyUnicode_KIND(y);
12135 x_data = PyUnicode_DATA(x);
12136 y_data = PyUnicode_DATA(y);
12137 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12138 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12139 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012140 if (!key || !value)
12141 goto err;
12142 res = PyDict_SetItem(new, key, value);
12143 Py_DECREF(key);
12144 Py_DECREF(value);
12145 if (res < 0)
12146 goto err;
12147 }
12148 /* create entries for deleting chars in z */
12149 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 z_kind = PyUnicode_KIND(z);
12151 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012152 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012154 if (!key)
12155 goto err;
12156 res = PyDict_SetItem(new, key, Py_None);
12157 Py_DECREF(key);
12158 if (res < 0)
12159 goto err;
12160 }
12161 }
12162 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 int kind;
12164 void *data;
12165
Georg Brandlceee0772007-11-27 23:48:05 +000012166 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012167 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012168 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12169 "to maketrans it must be a dict");
12170 goto err;
12171 }
12172 /* copy entries into the new dict, converting string keys to int keys */
12173 while (PyDict_Next(x, &i, &key, &value)) {
12174 if (PyUnicode_Check(key)) {
12175 /* convert string keys to integer keys */
12176 PyObject *newkey;
12177 if (PyUnicode_GET_SIZE(key) != 1) {
12178 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12179 "table must be of length 1");
12180 goto err;
12181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 kind = PyUnicode_KIND(key);
12183 data = PyUnicode_DATA(key);
12184 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012185 if (!newkey)
12186 goto err;
12187 res = PyDict_SetItem(new, newkey, value);
12188 Py_DECREF(newkey);
12189 if (res < 0)
12190 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012191 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012192 /* just keep integer keys */
12193 if (PyDict_SetItem(new, key, value) < 0)
12194 goto err;
12195 } else {
12196 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12197 "be strings or integers");
12198 goto err;
12199 }
12200 }
12201 }
12202 return new;
12203 err:
12204 Py_DECREF(new);
12205 return NULL;
12206}
12207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012208PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210\n\
12211Return a copy of the string S, where all characters have been mapped\n\
12212through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012213Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012214Unmapped characters are left untouched. Characters mapped to None\n\
12215are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
12217static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221}
12222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012223PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012226Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
12228static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012229unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 return fixup(self, fixupper);
12232}
12233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012234PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012237Pad a numeric string S with zeros on the left, to fill a field\n\
12238of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239
12240static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012241unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012243 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012244 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012245 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 int kind;
12247 void *data;
12248 Py_UCS4 chr;
12249
12250 if (PyUnicode_READY(self) == -1)
12251 return NULL;
12252
Martin v. Löwis18e16552006-02-15 17:27:45 +000012253 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 return NULL;
12255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012257 if (PyUnicode_CheckExact(self)) {
12258 Py_INCREF(self);
12259 return (PyObject*) self;
12260 }
12261 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012262 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 }
12264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
12267 u = pad(self, fill, 0, '0');
12268
Walter Dörwald068325e2002-04-15 13:36:47 +000012269 if (u == NULL)
12270 return NULL;
12271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 kind = PyUnicode_KIND(u);
12273 data = PyUnicode_DATA(u);
12274 chr = PyUnicode_READ(kind, data, fill);
12275
12276 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 PyUnicode_WRITE(kind, data, 0, chr);
12279 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 }
12281
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012282 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 return (PyObject*) u;
12284}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper returning the result
   of PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012294PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012297Return True if S starts with the specified prefix, False otherwise.\n\
12298With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012299With optional end, stop comparing S at that position.\n\
12300prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301
12302static PyObject *
12303unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012306 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012308 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012309 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012310 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
Jesus Ceaac451502011-04-20 17:09:23 +020012312 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012314 if (PyTuple_Check(subobj)) {
12315 Py_ssize_t i;
12316 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12317 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012319 if (substring == NULL)
12320 return NULL;
12321 result = tailmatch(self, substring, start, end, -1);
12322 Py_DECREF(substring);
12323 if (result) {
12324 Py_RETURN_TRUE;
12325 }
12326 }
12327 /* nothing matched */
12328 Py_RETURN_FALSE;
12329 }
12330 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012331 if (substring == NULL) {
12332 if (PyErr_ExceptionMatches(PyExc_TypeError))
12333 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12334 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012337 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012339 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340}
12341
12342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012343PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012346Return True if S ends with the specified suffix, False otherwise.\n\
12347With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012348With optional end, stop comparing S at that position.\n\
12349suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350
12351static PyObject *
12352unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012355 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012357 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012358 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012359 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360
Jesus Ceaac451502011-04-20 17:09:23 +020012361 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012363 if (PyTuple_Check(subobj)) {
12364 Py_ssize_t i;
12365 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12366 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012368 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012370 result = tailmatch(self, substring, start, end, +1);
12371 Py_DECREF(substring);
12372 if (result) {
12373 Py_RETURN_TRUE;
12374 }
12375 }
12376 Py_RETURN_FALSE;
12377 }
12378 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012379 if (substring == NULL) {
12380 if (PyErr_ExceptionMatches(PyExc_TypeError))
12381 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12382 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012384 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012385 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012387 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388}
12389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012391
12392PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012394\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012395Return a formatted version of S, using substitutions from args and kwargs.\n\
12396The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012397
Eric Smith27bbca62010-11-04 17:06:58 +000012398PyDoc_STRVAR(format_map__doc__,
12399 "S.format_map(mapping) -> str\n\
12400\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012401Return a formatted version of S, using substitutions from mapping.\n\
12402The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012403
Eric Smith4a7d76d2008-05-30 18:10:19 +000012404static PyObject *
12405unicode__format__(PyObject* self, PyObject* args)
12406{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012407 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012408
12409 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12410 return NULL;
12411
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012412 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012414 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012415}
12416
Eric Smith8c663262007-08-25 02:26:07 +000012417PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012419\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012420Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012421
12422static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012423unicode__sizeof__(PyUnicodeObject *v)
12424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 Py_ssize_t size;
12426
12427 /* If it's a compact object, account for base structure +
12428 character data. */
12429 if (PyUnicode_IS_COMPACT_ASCII(v))
12430 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12431 else if (PyUnicode_IS_COMPACT(v))
12432 size = sizeof(PyCompactUnicodeObject) +
12433 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12434 else {
12435 /* If it is a two-block object, account for base object, and
12436 for character block if present. */
12437 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012438 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 size += (PyUnicode_GET_LENGTH(v) + 1) *
12440 PyUnicode_CHARACTER_SIZE(v);
12441 }
12442 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012443 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012444 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012446 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012447 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448
12449 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012450}
12451
12452PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012454
12455static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012456unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012457{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012458 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 if (!copy)
12460 return NULL;
12461 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012462}
12463
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464static PyMethodDef unicode_methods[] = {
12465
12466 /* Order is according to common usage: often used methods should
12467 appear first, since lookup is done sequentially. */
12468
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012469 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012470 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12471 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012472 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012473 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12474 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12475 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12476 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12477 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12478 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12479 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012481 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12482 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12483 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012485 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12486 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12487 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012490 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012491 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012492 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12493 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12494 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12495 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12496 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12497 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12498 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12499 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12500 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12501 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12502 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12503 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12504 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12505 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012506 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012507 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012508 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012509 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012510 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012511 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012512 {"maketrans", (PyCFunction) unicode_maketrans,
12513 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012514 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012515#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012516 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517#endif
12518
12519#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012520 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012521 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522#endif
12523
Benjamin Peterson14339b62009-01-31 16:36:08 +000012524 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 {NULL, NULL}
12526};
12527
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012528static PyObject *
12529unicode_mod(PyObject *v, PyObject *w)
12530{
Brian Curtindfc80e32011-08-10 20:28:54 -050012531 if (!PyUnicode_Check(v))
12532 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012534}
12535
12536static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012537 0, /*nb_add*/
12538 0, /*nb_subtract*/
12539 0, /*nb_multiply*/
12540 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012541};
12542
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 (lenfunc) unicode_length, /* sq_length */
12545 PyUnicode_Concat, /* sq_concat */
12546 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12547 (ssizeargfunc) unicode_getitem, /* sq_item */
12548 0, /* sq_slice */
12549 0, /* sq_ass_item */
12550 0, /* sq_ass_slice */
12551 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552};
12553
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012554static PyObject*
12555unicode_subscript(PyUnicodeObject* self, PyObject* item)
12556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 if (PyUnicode_READY(self) == -1)
12558 return NULL;
12559
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012560 if (PyIndex_Check(item)) {
12561 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012562 if (i == -1 && PyErr_Occurred())
12563 return NULL;
12564 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012566 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012567 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012568 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012569 PyObject *result;
12570 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012571 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012572 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012576 return NULL;
12577 }
12578
12579 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 return PyUnicode_New(0, 0);
12581 } else if (start == 0 && step == 1 &&
12582 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012583 PyUnicode_CheckExact(self)) {
12584 Py_INCREF(self);
12585 return (PyObject *)self;
12586 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012587 return PyUnicode_Substring((PyObject*)self,
12588 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012589 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012590 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012591 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012592 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012593 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012594 src_data = PyUnicode_DATA(self);
12595 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12596 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012597 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012598 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012599 if (max_char >= kind_limit)
12600 break;
12601 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012602 }
12603 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012604 if (result == NULL)
12605 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012606 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012607 dest_data = PyUnicode_DATA(result);
12608
12609 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012610 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12611 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012612 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012613 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012614 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012615 } else {
12616 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12617 return NULL;
12618 }
12619}
12620
12621static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012622 (lenfunc)unicode_length, /* mp_length */
12623 (binaryfunc)unicode_subscript, /* mp_subscript */
12624 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012625};
12626
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628/* Helpers for PyUnicode_Format() */
12629
12630static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012631getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012633 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 (*p_argidx)++;
12636 if (arglen < 0)
12637 return args;
12638 else
12639 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 }
12641 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643 return NULL;
12644}
12645
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012646/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012648static PyObject *
12649formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012651 char *p;
12652 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012654
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655 x = PyFloat_AsDouble(v);
12656 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012657 return NULL;
12658
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012661
Eric Smith0923d1d2009-04-16 20:16:10 +000012662 p = PyOS_double_to_string(x, type, prec,
12663 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012664 if (p == NULL)
12665 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012667 PyMem_Free(p);
12668 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669}
12670
Tim Peters38fd5b62000-09-21 05:43:11 +000012671static PyObject*
12672formatlong(PyObject *val, int flags, int prec, int type)
12673{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012674 char *buf;
12675 int len;
12676 PyObject *str; /* temporary string object. */
12677 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012678
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12680 if (!str)
12681 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 Py_DECREF(str);
12684 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012685}
12686
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012689 size_t buflen,
12690 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012692 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012693 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (PyUnicode_GET_LENGTH(v) == 1) {
12695 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 buf[1] = '\0';
12697 return 1;
12698 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 goto onError;
12700 }
12701 else {
12702 /* Integer input truncated to a character */
12703 long x;
12704 x = PyLong_AsLong(v);
12705 if (x == -1 && PyErr_Occurred())
12706 goto onError;
12707
12708 if (x < 0 || x > 0x10ffff) {
12709 PyErr_SetString(PyExc_OverflowError,
12710 "%c arg not in range(0x110000)");
12711 return -1;
12712 }
12713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012715 buf[1] = '\0';
12716 return 1;
12717 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012718
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012720 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012722 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723}
12724
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in elements) of the scratch buffer in which
   single characters are formatted.  The expansion is fully parenthesized
   so the macro behaves as one operand in any expression, including
   `sizeof FORMATBUFLEN`.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012729
Alexander Belopolsky40018472011-02-26 01:02:56 +000012730PyObject *
12731PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 void *fmt;
12734 int fmtkind;
12735 PyObject *result;
12736 Py_UCS4 *res, *res0;
12737 Py_UCS4 max;
12738 int kind;
12739 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012743
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 PyErr_BadInternalCall();
12746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12749 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 fmt = PyUnicode_DATA(uformat);
12752 fmtkind = PyUnicode_KIND(uformat);
12753 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12754 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755
12756 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12758 if (res0 == NULL) {
12759 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762
12763 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 arglen = PyTuple_Size(args);
12765 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766 }
12767 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 arglen = -1;
12769 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012771 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012772 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
12775 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 if (--rescnt < 0) {
12778 rescnt = fmtcnt + 100;
12779 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12781 if (res0 == NULL){
12782 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 }
12785 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012789 }
12790 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 /* Got a format specifier */
12792 int flags = 0;
12793 Py_ssize_t width = -1;
12794 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 Py_UCS4 c = '\0';
12796 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 int isnumok;
12798 PyObject *v = NULL;
12799 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 void *pbuf;
12801 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 Py_ssize_t len, len1;
12804 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 fmtpos++;
12807 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12808 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 Py_ssize_t keylen;
12810 PyObject *key;
12811 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012812
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 if (dict == NULL) {
12814 PyErr_SetString(PyExc_TypeError,
12815 "format requires a mapping");
12816 goto onError;
12817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 /* Skip over balanced parentheses */
12822 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 if (fmtcnt < 0 || pcount > 0) {
12831 PyErr_SetString(PyExc_ValueError,
12832 "incomplete format key");
12833 goto onError;
12834 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012835 key = PyUnicode_Substring((PyObject*)uformat,
12836 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 if (key == NULL)
12838 goto onError;
12839 if (args_owned) {
12840 Py_DECREF(args);
12841 args_owned = 0;
12842 }
12843 args = PyObject_GetItem(dict, key);
12844 Py_DECREF(key);
12845 if (args == NULL) {
12846 goto onError;
12847 }
12848 args_owned = 1;
12849 arglen = -1;
12850 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012851 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 case '-': flags |= F_LJUST; continue;
12855 case '+': flags |= F_SIGN; continue;
12856 case ' ': flags |= F_BLANK; continue;
12857 case '#': flags |= F_ALT; continue;
12858 case '0': flags |= F_ZERO; continue;
12859 }
12860 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012861 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 if (c == '*') {
12863 v = getnextarg(args, arglen, &argidx);
12864 if (v == NULL)
12865 goto onError;
12866 if (!PyLong_Check(v)) {
12867 PyErr_SetString(PyExc_TypeError,
12868 "* wants int");
12869 goto onError;
12870 }
12871 width = PyLong_AsLong(v);
12872 if (width == -1 && PyErr_Occurred())
12873 goto onError;
12874 if (width < 0) {
12875 flags |= F_LJUST;
12876 width = -width;
12877 }
12878 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 }
12881 else if (c >= '0' && c <= '9') {
12882 width = c - '0';
12883 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 if (c < '0' || c > '9')
12886 break;
12887 if ((width*10) / 10 != width) {
12888 PyErr_SetString(PyExc_ValueError,
12889 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012891 }
12892 width = width*10 + (c - '0');
12893 }
12894 }
12895 if (c == '.') {
12896 prec = 0;
12897 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 if (c == '*') {
12900 v = getnextarg(args, arglen, &argidx);
12901 if (v == NULL)
12902 goto onError;
12903 if (!PyLong_Check(v)) {
12904 PyErr_SetString(PyExc_TypeError,
12905 "* wants int");
12906 goto onError;
12907 }
12908 prec = PyLong_AsLong(v);
12909 if (prec == -1 && PyErr_Occurred())
12910 goto onError;
12911 if (prec < 0)
12912 prec = 0;
12913 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 }
12916 else if (c >= '0' && c <= '9') {
12917 prec = c - '0';
12918 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 if (c < '0' || c > '9')
12921 break;
12922 if ((prec*10) / 10 != prec) {
12923 PyErr_SetString(PyExc_ValueError,
12924 "prec too big");
12925 goto onError;
12926 }
12927 prec = prec*10 + (c - '0');
12928 }
12929 }
12930 } /* prec */
12931 if (fmtcnt >= 0) {
12932 if (c == 'h' || c == 'l' || c == 'L') {
12933 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012935 }
12936 }
12937 if (fmtcnt < 0) {
12938 PyErr_SetString(PyExc_ValueError,
12939 "incomplete format");
12940 goto onError;
12941 }
12942 if (c != '%') {
12943 v = getnextarg(args, arglen, &argidx);
12944 if (v == NULL)
12945 goto onError;
12946 }
12947 sign = 0;
12948 fill = ' ';
12949 switch (c) {
12950
12951 case '%':
12952 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 len = 1;
12957 break;
12958
12959 case 's':
12960 case 'r':
12961 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012962 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 temp = v;
12964 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012965 }
12966 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 if (c == 's')
12968 temp = PyObject_Str(v);
12969 else if (c == 'r')
12970 temp = PyObject_Repr(v);
12971 else
12972 temp = PyObject_ASCII(v);
12973 if (temp == NULL)
12974 goto onError;
12975 if (PyUnicode_Check(temp))
12976 /* nothing to do */;
12977 else {
12978 Py_DECREF(temp);
12979 PyErr_SetString(PyExc_TypeError,
12980 "%s argument has non-string str()");
12981 goto onError;
12982 }
12983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 if (PyUnicode_READY(temp) == -1) {
12985 Py_CLEAR(temp);
12986 goto onError;
12987 }
12988 pbuf = PyUnicode_DATA(temp);
12989 kind = PyUnicode_KIND(temp);
12990 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 if (prec >= 0 && len > prec)
12992 len = prec;
12993 break;
12994
12995 case 'i':
12996 case 'd':
12997 case 'u':
12998 case 'o':
12999 case 'x':
13000 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 isnumok = 0;
13002 if (PyNumber_Check(v)) {
13003 PyObject *iobj=NULL;
13004
13005 if (PyLong_Check(v)) {
13006 iobj = v;
13007 Py_INCREF(iobj);
13008 }
13009 else {
13010 iobj = PyNumber_Long(v);
13011 }
13012 if (iobj!=NULL) {
13013 if (PyLong_Check(iobj)) {
13014 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013015 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 Py_DECREF(iobj);
13017 if (!temp)
13018 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 if (PyUnicode_READY(temp) == -1) {
13020 Py_CLEAR(temp);
13021 goto onError;
13022 }
13023 pbuf = PyUnicode_DATA(temp);
13024 kind = PyUnicode_KIND(temp);
13025 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 sign = 1;
13027 }
13028 else {
13029 Py_DECREF(iobj);
13030 }
13031 }
13032 }
13033 if (!isnumok) {
13034 PyErr_Format(PyExc_TypeError,
13035 "%%%c format: a number is required, "
13036 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13037 goto onError;
13038 }
13039 if (flags & F_ZERO)
13040 fill = '0';
13041 break;
13042
13043 case 'e':
13044 case 'E':
13045 case 'f':
13046 case 'F':
13047 case 'g':
13048 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013049 temp = formatfloat(v, flags, prec, c);
13050 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052 if (PyUnicode_READY(temp) == -1) {
13053 Py_CLEAR(temp);
13054 goto onError;
13055 }
13056 pbuf = PyUnicode_DATA(temp);
13057 kind = PyUnicode_KIND(temp);
13058 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013059 sign = 1;
13060 if (flags & F_ZERO)
13061 fill = '0';
13062 break;
13063
13064 case 'c':
13065 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020013067 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 if (len < 0)
13069 goto onError;
13070 break;
13071
13072 default:
13073 PyErr_Format(PyExc_ValueError,
13074 "unsupported format character '%c' (0x%x) "
13075 "at index %zd",
13076 (31<=c && c<=126) ? (char)c : '?',
13077 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 goto onError;
13080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 /* pbuf is initialized here. */
13082 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13085 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13086 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 len--;
13088 }
13089 else if (flags & F_SIGN)
13090 sign = '+';
13091 else if (flags & F_BLANK)
13092 sign = ' ';
13093 else
13094 sign = 0;
13095 }
13096 if (width < len)
13097 width = len;
13098 if (rescnt - (sign != 0) < width) {
13099 reslen -= rescnt;
13100 rescnt = width + fmtcnt + 100;
13101 reslen += rescnt;
13102 if (reslen < 0) {
13103 Py_XDECREF(temp);
13104 PyErr_NoMemory();
13105 goto onError;
13106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13108 if (res0 == 0) {
13109 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 Py_XDECREF(temp);
13111 goto onError;
13112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 }
13115 if (sign) {
13116 if (fill != ' ')
13117 *res++ = sign;
13118 rescnt--;
13119 if (width > len)
13120 width--;
13121 }
13122 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13124 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13127 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 }
13129 rescnt -= 2;
13130 width -= 2;
13131 if (width < 0)
13132 width = 0;
13133 len -= 2;
13134 }
13135 if (width > len && !(flags & F_LJUST)) {
13136 do {
13137 --rescnt;
13138 *res++ = fill;
13139 } while (--width > len);
13140 }
13141 if (fill == ' ') {
13142 if (sign)
13143 *res++ = sign;
13144 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13146 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13147 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13148 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 }
13150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 /* Copy all characters, preserving len */
13152 len1 = len;
13153 while (len1--) {
13154 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13155 rescnt--;
13156 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 while (--width >= len) {
13158 --rescnt;
13159 *res++ = ' ';
13160 }
13161 if (dict && (argidx < arglen) && c != '%') {
13162 PyErr_SetString(PyExc_TypeError,
13163 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013164 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 goto onError;
13166 }
13167 Py_XDECREF(temp);
13168 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169 } /* until end */
13170 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 PyErr_SetString(PyExc_TypeError,
13172 "not all arguments converted during string formatting");
13173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174 }
13175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176
13177 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13178 if (*res > max)
13179 max = *res;
13180 result = PyUnicode_New(reslen - rescnt, max);
13181 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 kind = PyUnicode_KIND(result);
13184 for (res = res0; res < res0+reslen-rescnt; res++)
13185 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13186 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189 }
13190 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013191 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192 return (PyObject *)result;
13193
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196 Py_DECREF(uformat);
13197 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 }
13200 return NULL;
13201}
13202
Jeremy Hylton938ace62002-07-17 16:30:39 +000013203static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013204unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13205
Tim Peters6d6c1a32001-08-02 04:15:00 +000013206static PyObject *
13207unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13208{
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 static char *kwlist[] = {"object", "encoding", "errors", 0};
13211 char *encoding = NULL;
13212 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013213
Benjamin Peterson14339b62009-01-31 16:36:08 +000013214 if (type != &PyUnicode_Type)
13215 return unicode_subtype_new(type, args, kwds);
13216 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013218 return NULL;
13219 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013221 if (encoding == NULL && errors == NULL)
13222 return PyObject_Str(x);
13223 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013224 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013225}
13226
Guido van Rossume023fe02001-08-30 03:12:59 +000013227static PyObject *
13228unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13229{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013230 PyUnicodeObject *unicode, *self;
13231 Py_ssize_t length, char_size;
13232 int share_wstr, share_utf8;
13233 unsigned int kind;
13234 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013235
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013237
13238 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13239 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013240 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013241 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013242 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013243 return NULL;
13244
13245 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13246 if (self == NULL) {
13247 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013248 return NULL;
13249 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013250 kind = PyUnicode_KIND(unicode);
13251 length = PyUnicode_GET_LENGTH(unicode);
13252
13253 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013254#ifdef Py_DEBUG
13255 _PyUnicode_HASH(self) = -1;
13256#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013257 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013258#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013259 _PyUnicode_STATE(self).interned = 0;
13260 _PyUnicode_STATE(self).kind = kind;
13261 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013262 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013263 _PyUnicode_STATE(self).ready = 1;
13264 _PyUnicode_WSTR(self) = NULL;
13265 _PyUnicode_UTF8_LENGTH(self) = 0;
13266 _PyUnicode_UTF8(self) = NULL;
13267 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013268 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013269
13270 share_utf8 = 0;
13271 share_wstr = 0;
13272 if (kind == PyUnicode_1BYTE_KIND) {
13273 char_size = 1;
13274 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13275 share_utf8 = 1;
13276 }
13277 else if (kind == PyUnicode_2BYTE_KIND) {
13278 char_size = 2;
13279 if (sizeof(wchar_t) == 2)
13280 share_wstr = 1;
13281 }
13282 else {
13283 assert(kind == PyUnicode_4BYTE_KIND);
13284 char_size = 4;
13285 if (sizeof(wchar_t) == 4)
13286 share_wstr = 1;
13287 }
13288
13289 /* Ensure we won't overflow the length. */
13290 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13291 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013293 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013294 data = PyObject_MALLOC((length + 1) * char_size);
13295 if (data == NULL) {
13296 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 goto onError;
13298 }
13299
Victor Stinnerc3c74152011-10-02 20:39:55 +020013300 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013301 if (share_utf8) {
13302 _PyUnicode_UTF8_LENGTH(self) = length;
13303 _PyUnicode_UTF8(self) = data;
13304 }
13305 if (share_wstr) {
13306 _PyUnicode_WSTR_LENGTH(self) = length;
13307 _PyUnicode_WSTR(self) = (wchar_t *)data;
13308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013310 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13311 PyUnicode_KIND_SIZE(kind, length + 1));
13312 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013313 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013314#ifdef Py_DEBUG
13315 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13316#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013317 return (PyObject *)self;
13318
13319onError:
13320 Py_DECREF(unicode);
13321 Py_DECREF(self);
13322 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013323}
13324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013325PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013326 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013327\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013328Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013329encoding defaults to the current default string encoding.\n\
13330errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013331
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013332static PyObject *unicode_iter(PyObject *seq);
13333
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013335 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013336 "str", /* tp_name */
13337 sizeof(PyUnicodeObject), /* tp_size */
13338 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 (destructor)unicode_dealloc, /* tp_dealloc */
13341 0, /* tp_print */
13342 0, /* tp_getattr */
13343 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013344 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013345 unicode_repr, /* tp_repr */
13346 &unicode_as_number, /* tp_as_number */
13347 &unicode_as_sequence, /* tp_as_sequence */
13348 &unicode_as_mapping, /* tp_as_mapping */
13349 (hashfunc) unicode_hash, /* tp_hash*/
13350 0, /* tp_call*/
13351 (reprfunc) unicode_str, /* tp_str */
13352 PyObject_GenericGetAttr, /* tp_getattro */
13353 0, /* tp_setattro */
13354 0, /* tp_as_buffer */
13355 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013357 unicode_doc, /* tp_doc */
13358 0, /* tp_traverse */
13359 0, /* tp_clear */
13360 PyUnicode_RichCompare, /* tp_richcompare */
13361 0, /* tp_weaklistoffset */
13362 unicode_iter, /* tp_iter */
13363 0, /* tp_iternext */
13364 unicode_methods, /* tp_methods */
13365 0, /* tp_members */
13366 0, /* tp_getset */
13367 &PyBaseObject_Type, /* tp_base */
13368 0, /* tp_dict */
13369 0, /* tp_descr_get */
13370 0, /* tp_descr_set */
13371 0, /* tp_dictoffset */
13372 0, /* tp_init */
13373 0, /* tp_alloc */
13374 unicode_new, /* tp_new */
13375 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376};
13377
13378/* Initialize the Unicode implementation */
13379
Thomas Wouters78890102000-07-22 19:25:51 +000013380void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013382 int i;
13383
Thomas Wouters477c8d52006-05-27 19:21:47 +000013384 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386 0x000A, /* LINE FEED */
13387 0x000D, /* CARRIAGE RETURN */
13388 0x001C, /* FILE SEPARATOR */
13389 0x001D, /* GROUP SEPARATOR */
13390 0x001E, /* RECORD SEPARATOR */
13391 0x0085, /* NEXT LINE */
13392 0x2028, /* LINE SEPARATOR */
13393 0x2029, /* PARAGRAPH SEPARATOR */
13394 };
13395
Fred Drakee4315f52000-05-09 19:53:39 +000013396 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013397 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013398 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013402 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013404 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013406
13407 /* initialize the linebreak bloom filter */
13408 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013410 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013411
13412 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413}
13414
13415/* Finalize the Unicode implementation */
13416
/* Free-list clearing hook for str objects.  This implementation has
   nothing to release and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13422
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423void
Thomas Wouters78890102000-07-22 19:25:51 +000013424_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013426 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013428 Py_XDECREF(unicode_empty);
13429 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013430
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013431 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 if (unicode_latin1[i]) {
13433 Py_DECREF(unicode_latin1[i]);
13434 unicode_latin1[i] = NULL;
13435 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013436 }
Christian Heimesa156e092008-02-16 07:38:31 +000013437 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013439
Walter Dörwald16807132007-05-25 13:52:07 +000013440void
13441PyUnicode_InternInPlace(PyObject **p)
13442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13444 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013445#ifdef Py_DEBUG
13446 assert(s != NULL);
13447 assert(_PyUnicode_CHECK(s));
13448#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013449 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013450 return;
13451#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013452 /* If it's a subclass, we don't really know what putting
13453 it in the interned dict might do. */
13454 if (!PyUnicode_CheckExact(s))
13455 return;
13456 if (PyUnicode_CHECK_INTERNED(s))
13457 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013458 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013459 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 return;
13461 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013462 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013463 if (interned == NULL) {
13464 interned = PyDict_New();
13465 if (interned == NULL) {
13466 PyErr_Clear(); /* Don't leave an exception */
13467 return;
13468 }
13469 }
13470 /* It might be that the GetItem call fails even
13471 though the key is present in the dictionary,
13472 namely when this happens during a stack overflow. */
13473 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013475 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013476
Benjamin Peterson29060642009-01-31 22:14:21 +000013477 if (t) {
13478 Py_INCREF(t);
13479 Py_DECREF(*p);
13480 *p = t;
13481 return;
13482 }
Walter Dörwald16807132007-05-25 13:52:07 +000013483
Benjamin Peterson14339b62009-01-31 16:36:08 +000013484 PyThreadState_GET()->recursion_critical = 1;
13485 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13486 PyErr_Clear();
13487 PyThreadState_GET()->recursion_critical = 0;
13488 return;
13489 }
13490 PyThreadState_GET()->recursion_critical = 0;
13491 /* The two references in interned are not counted by refcnt.
13492 The deallocator will take care of this */
13493 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013495}
13496
13497void
13498PyUnicode_InternImmortal(PyObject **p)
13499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13501
Benjamin Peterson14339b62009-01-31 16:36:08 +000013502 PyUnicode_InternInPlace(p);
13503 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013505 Py_INCREF(*p);
13506 }
Walter Dörwald16807132007-05-25 13:52:07 +000013507}
13508
13509PyObject *
13510PyUnicode_InternFromString(const char *cp)
13511{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013512 PyObject *s = PyUnicode_FromString(cp);
13513 if (s == NULL)
13514 return NULL;
13515 PyUnicode_InternInPlace(&s);
13516 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013517}
13518
Alexander Belopolsky40018472011-02-26 01:02:56 +000013519void
13520_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013521{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013522 PyObject *keys;
13523 PyUnicodeObject *s;
13524 Py_ssize_t i, n;
13525 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013526
Benjamin Peterson14339b62009-01-31 16:36:08 +000013527 if (interned == NULL || !PyDict_Check(interned))
13528 return;
13529 keys = PyDict_Keys(interned);
13530 if (keys == NULL || !PyList_Check(keys)) {
13531 PyErr_Clear();
13532 return;
13533 }
Walter Dörwald16807132007-05-25 13:52:07 +000013534
Benjamin Peterson14339b62009-01-31 16:36:08 +000013535 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13536 detector, interned unicode strings are not forcibly deallocated;
13537 rather, we give them their stolen references back, and then clear
13538 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013539
Benjamin Peterson14339b62009-01-31 16:36:08 +000013540 n = PyList_GET_SIZE(keys);
13541 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013543 for (i = 0; i < n; i++) {
13544 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013545 if (PyUnicode_READY(s) == -1) {
13546 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013547 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013549 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013550 case SSTATE_NOT_INTERNED:
13551 /* XXX Shouldn't happen */
13552 break;
13553 case SSTATE_INTERNED_IMMORTAL:
13554 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013556 break;
13557 case SSTATE_INTERNED_MORTAL:
13558 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 break;
13561 default:
13562 Py_FatalError("Inconsistent interned string state.");
13563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013565 }
13566 fprintf(stderr, "total size of all interned strings: "
13567 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13568 "mortal/immortal\n", mortal_size, immortal_size);
13569 Py_DECREF(keys);
13570 PyDict_Clear(interned);
13571 Py_DECREF(interned);
13572 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013573}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013574
13575
13576/********************* Unicode Iterator **************************/
13577
13578typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013579 PyObject_HEAD
13580 Py_ssize_t it_index;
13581 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013582} unicodeiterobject;
13583
13584static void
13585unicodeiter_dealloc(unicodeiterobject *it)
13586{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013587 _PyObject_GC_UNTRACK(it);
13588 Py_XDECREF(it->it_seq);
13589 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013590}
13591
13592static int
13593unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 Py_VISIT(it->it_seq);
13596 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013597}
13598
13599static PyObject *
13600unicodeiter_next(unicodeiterobject *it)
13601{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 PyUnicodeObject *seq;
13603 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013604
Benjamin Peterson14339b62009-01-31 16:36:08 +000013605 assert(it != NULL);
13606 seq = it->it_seq;
13607 if (seq == NULL)
13608 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013609 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13612 int kind = PyUnicode_KIND(seq);
13613 void *data = PyUnicode_DATA(seq);
13614 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13615 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013616 if (item != NULL)
13617 ++it->it_index;
13618 return item;
13619 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013620
Benjamin Peterson14339b62009-01-31 16:36:08 +000013621 Py_DECREF(seq);
13622 it->it_seq = NULL;
13623 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013624}
13625
13626static PyObject *
13627unicodeiter_len(unicodeiterobject *it)
13628{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 Py_ssize_t len = 0;
13630 if (it->it_seq)
13631 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13632 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013633}
13634
13635PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13636
13637static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013638 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013639 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013640 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013641};
13642
13643PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13645 "str_iterator", /* tp_name */
13646 sizeof(unicodeiterobject), /* tp_basicsize */
13647 0, /* tp_itemsize */
13648 /* methods */
13649 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13650 0, /* tp_print */
13651 0, /* tp_getattr */
13652 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013653 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013654 0, /* tp_repr */
13655 0, /* tp_as_number */
13656 0, /* tp_as_sequence */
13657 0, /* tp_as_mapping */
13658 0, /* tp_hash */
13659 0, /* tp_call */
13660 0, /* tp_str */
13661 PyObject_GenericGetAttr, /* tp_getattro */
13662 0, /* tp_setattro */
13663 0, /* tp_as_buffer */
13664 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13665 0, /* tp_doc */
13666 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13667 0, /* tp_clear */
13668 0, /* tp_richcompare */
13669 0, /* tp_weaklistoffset */
13670 PyObject_SelfIter, /* tp_iter */
13671 (iternextfunc)unicodeiter_next, /* tp_iternext */
13672 unicodeiter_methods, /* tp_methods */
13673 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013674};
13675
13676static PyObject *
13677unicode_iter(PyObject *seq)
13678{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013680
Benjamin Peterson14339b62009-01-31 16:36:08 +000013681 if (!PyUnicode_Check(seq)) {
13682 PyErr_BadInternalCall();
13683 return NULL;
13684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 if (PyUnicode_READY(seq) == -1)
13686 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013687 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13688 if (it == NULL)
13689 return NULL;
13690 it->it_index = 0;
13691 Py_INCREF(seq);
13692 it->it_seq = (PyUnicodeObject *)seq;
13693 _PyObject_GC_TRACK(it);
13694 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013695}
13696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013697#define UNIOP(x) Py_UNICODE_##x
13698#define UNIOP_t Py_UNICODE
13699#include "uniops.h"
13700#undef UNIOP
13701#undef UNIOP_t
13702#define UNIOP(x) Py_UCS4_##x
13703#define UNIOP_t Py_UCS4
13704#include "uniops.h"
13705#undef UNIOP
13706#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013707
Victor Stinner71133ff2010-09-01 23:43:53 +000013708Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013709PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013710{
13711 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13712 Py_UNICODE *copy;
13713 Py_ssize_t size;
13714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013715 if (!PyUnicode_Check(unicode)) {
13716 PyErr_BadArgument();
13717 return NULL;
13718 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013719 /* Ensure we won't overflow the size. */
13720 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13721 PyErr_NoMemory();
13722 return NULL;
13723 }
13724 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13725 size *= sizeof(Py_UNICODE);
13726 copy = PyMem_Malloc(size);
13727 if (copy == NULL) {
13728 PyErr_NoMemory();
13729 return NULL;
13730 }
13731 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13732 return copy;
13733}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013734
Georg Brandl66c221e2010-10-14 07:04:07 +000013735/* A _string module, to export formatter_parser and formatter_field_name_split
13736 to the string.Formatter class implemented in Python. */
13737
13738static PyMethodDef _string_methods[] = {
13739 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13740 METH_O, PyDoc_STR("split the argument as a field name")},
13741 {"formatter_parser", (PyCFunction) formatter_parser,
13742 METH_O, PyDoc_STR("parse the argument as a format string")},
13743 {NULL, NULL}
13744};
13745
13746static struct PyModuleDef _string_module = {
13747 PyModuleDef_HEAD_INIT,
13748 "_string",
13749 PyDoc_STR("string helper module"),
13750 0,
13751 _string_methods,
13752 NULL,
13753 NULL,
13754 NULL,
13755 NULL
13756};
13757
13758PyMODINIT_FUNC
13759PyInit__string(void)
13760{
13761 return PyModule_Create(&_string_module);
13762}
13763
13764
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013765#ifdef __cplusplus
13766}
13767#endif