blob: 75fc23c795b042d5a8ffaf275659e3559ee50728 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Debug builds define DONT_MAKE_RESULT_READY.
   NOTE(review): the consumer of this switch is not in this part of the
   file -- confirm its effect where it is tested. */
#if defined(Py_DEBUG)
#  define DONT_MAKE_RESULT_READY
#endif

/* Upper bound on the number of entries of the Unicode object free list. */
#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory, which reduces malloc() overhead for small
   Unicode objects.

   In the worst case this leaves
       PyUnicode_MAXFREELIST *
       (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage around.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switch: little endian unless WORDS_BIGENDIAN is defined. */
#ifndef WORDS_BIGENDIAN
#  define BYTEORDER_IS_LITTLE_ENDIAN
#else
#  define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used by the checked accessors below: the consistency check
   in debug builds, a plain PyUnicode_Check() otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* --- Raw field accessors (no checks, usable as lvalues) --- */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors (assert on the object before reading) --- */

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   address just past the PyASCIIObject header; otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   string this is the string length; otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as above for a pointer to an object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).  The argument is parenthesized before
   being dereferenced so that any pointer expression can be passed. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of the object is the character data itself
   (the UTF-8 representation shares the data buffer).  Must not be used on
   compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the character data itself
   (the wchar_t representation shares the data buffer).  The macro only
   refers to its own argument, so it works whatever the caller's variable
   is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper to copy characters between buffers of different
   character types, casting each element.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer, treated as "to_type *".  end is re-evaluated on
   every iteration; begin and to are evaluated once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: entry N is
   1 when the 7-bit character N is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200242static void copy_characters(
243 PyObject *to, Py_ssize_t to_start,
244 PyObject *from, Py_ssize_t from_start,
245 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200246#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200247static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200248#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static PyObject *
251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
253 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
259 const Py_UNICODE *unicode, Py_ssize_t size,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Same kind of table for line breaks: entry N is 1 when the 7-bit
   character N is a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object.

   Asserts that the state flags (kind, compact, ascii, ready), the data,
   utf8 and wstr pointers and the cached lengths agree with each other.
   When check_content is true, it also scans the characters to assert that
   the narrowest possible kind is used, and asserts that the hash is still
   unset unless the object is a singleton.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            chars = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (cobj->utf8 != chars);
        }
        else {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            chars = legacy->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(chars == NULL);
                assert(cobj->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(chars != NULL);
                if (ascii->state.ascii) {
                    assert (cobj->utf8 == chars);
                    assert (cobj->utf8_length == ascii->length);
                }
                else {
                    assert (cobj->utf8 != chars);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                /* wstr must share the character data */
                assert(ascii->wstr == chars);
                assert(cobj->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != chars);
            }
        }

        if (cobj->utf8 == NULL) {
            assert(cobj->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(cobj->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > highest) {
                highest = ch;
            }
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
        }
        else {
            assert(highest >= 0x10000);
        }
    }

    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
410
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411/* --- Bloom Filters ----------------------------------------------------- */
412
413/* stuff to implement simple "bloom filters" for Unicode characters.
414 to keep things simple, we use a single bitmask, using the least 5
415 bits from each unicode characters as the bit index. */
416
417/* the linebreak mask is set up by Unicode_Init below */
418
Antoine Pitrouf068f942010-01-13 14:19:12 +0000419#if LONG_BIT >= 128
420#define BLOOM_WIDTH 128
421#elif LONG_BIT >= 64
422#define BLOOM_WIDTH 64
423#elif LONG_BIT >= 32
424#define BLOOM_WIDTH 32
425#else
426#error "LONG_BIT is smaller than 32"
427#endif
428
Thomas Wouters477c8d52006-05-27 19:21:47 +0000429#define BLOOM_MASK unsigned long
430
431static BLOOM_MASK bloom_linebreak;
432
Antoine Pitrouf068f942010-01-13 14:19:12 +0000433#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
434#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000435
Benjamin Peterson29060642009-01-31 22:14:21 +0000436#define BLOOM_LINEBREAK(ch) \
437 ((ch) < 128U ? ascii_linebreak[(ch)] : \
438 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000439
Alexander Belopolsky40018472011-02-26 01:02:56 +0000440Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442{
443 /* calculate simple bloom-style bitmask for a given unicode string */
444
Antoine Pitrouf068f942010-01-13 14:19:12 +0000445 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000446 Py_ssize_t i;
447
448 mask = 0;
449 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200450 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000451
452 return mask;
453}
454
/* True if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar() is only called on a mask hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459/* --- Unicode Object ----------------------------------------------------- */
460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200462fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200463
464Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
465 Py_ssize_t size, Py_UCS4 ch,
466 int direction)
467{
468 /* like wcschr, but doesn't stop at NULL characters */
469 Py_ssize_t i;
470 if (direction == 1) {
471 for(i = 0; i < size; i++)
472 if (PyUnicode_READ(kind, s, i) == ch)
473 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
474 }
475 else {
476 for(i = size-1; i >= 0; i--)
477 if (PyUnicode_READ(kind, s, i) == ch)
478 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
479 }
480 return NULL;
481}
482
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483static PyObject*
484resize_compact(PyObject *unicode, Py_ssize_t length)
485{
486 Py_ssize_t char_size;
487 Py_ssize_t struct_size;
488 Py_ssize_t new_size;
489 int share_wstr;
490
491 assert(PyUnicode_IS_READY(unicode));
492 char_size = PyUnicode_CHARACTER_SIZE(unicode);
493 if (PyUnicode_IS_COMPACT_ASCII(unicode))
494 struct_size = sizeof(PyASCIIObject);
495 else
496 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200497 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200498
499 _Py_DEC_REFTOTAL;
500 _Py_ForgetReference(unicode);
501
502 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
503 PyErr_NoMemory();
504 return NULL;
505 }
506 new_size = (struct_size + (length + 1) * char_size);
507
508 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
509 if (unicode == NULL) {
510 PyObject_Del(unicode);
511 PyErr_NoMemory();
512 return NULL;
513 }
514 _Py_NewReference(unicode);
515 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200516 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200517 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200518 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
519 _PyUnicode_WSTR_LENGTH(unicode) = length;
520 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200521 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
522 length, 0);
523 return unicode;
524}
525
Alexander Belopolsky40018472011-02-26 01:02:56 +0000526static int
Victor Stinner95663112011-10-04 01:03:50 +0200527resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528{
Victor Stinner95663112011-10-04 01:03:50 +0200529 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200530 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200531 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000532
Victor Stinner95663112011-10-04 01:03:50 +0200533 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534
535 if (PyUnicode_IS_READY(unicode)) {
536 Py_ssize_t char_size;
537 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200538 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200539 void *data;
540
541 data = _PyUnicode_DATA_ANY(unicode);
542 assert(data != NULL);
543 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200544 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
545 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200546 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
547 {
548 PyObject_DEL(_PyUnicode_UTF8(unicode));
549 _PyUnicode_UTF8(unicode) = NULL;
550 _PyUnicode_UTF8_LENGTH(unicode) = 0;
551 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200552
553 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
554 PyErr_NoMemory();
555 return -1;
556 }
557 new_size = (length + 1) * char_size;
558
559 data = (PyObject *)PyObject_REALLOC(data, new_size);
560 if (data == NULL) {
561 PyErr_NoMemory();
562 return -1;
563 }
564 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200565 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200566 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200567 _PyUnicode_WSTR_LENGTH(unicode) = length;
568 }
569 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200570 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200571 _PyUnicode_UTF8_LENGTH(unicode) = length;
572 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 _PyUnicode_LENGTH(unicode) = length;
574 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200575 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200579 }
Victor Stinner95663112011-10-04 01:03:50 +0200580 assert(_PyUnicode_WSTR(unicode) != NULL);
581
582 /* check for integer overflow */
583 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
584 PyErr_NoMemory();
585 return -1;
586 }
587 wstr = _PyUnicode_WSTR(unicode);
588 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
589 if (!wstr) {
590 PyErr_NoMemory();
591 return -1;
592 }
593 _PyUnicode_WSTR(unicode) = wstr;
594 _PyUnicode_WSTR(unicode)[length] = 0;
595 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 return 0;
598}
599
Victor Stinnerfe226c02011-10-03 03:52:20 +0200600static PyObject*
601resize_copy(PyObject *unicode, Py_ssize_t length)
602{
603 Py_ssize_t copy_length;
604 if (PyUnicode_IS_COMPACT(unicode)) {
605 PyObject *copy;
606 assert(PyUnicode_IS_READY(unicode));
607
608 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
609 if (copy == NULL)
610 return NULL;
611
612 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200613 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200615 }
616 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 assert(_PyUnicode_WSTR(unicode) != NULL);
619 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200620 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200621 if (w == NULL)
622 return NULL;
623 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
624 copy_length = Py_MIN(copy_length, length);
625 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
626 copy_length);
627 return (PyObject*)w;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000632 Ux0000 terminated; some code (e.g. new_identifier)
633 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000636 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637
638*/
639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200640#ifdef Py_DEBUG
641int unicode_old_new_calls = 0;
642#endif
643
Alexander Belopolsky40018472011-02-26 01:02:56 +0000644static PyUnicodeObject *
645_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646{
647 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649
Thomas Wouters477c8d52006-05-27 19:21:47 +0000650 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 if (length == 0 && unicode_empty != NULL) {
652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200653 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 }
655
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000656 /* Ensure we won't overflow the size. */
657 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
658 return (PyUnicodeObject *)PyErr_NoMemory();
659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660 if (length < 0) {
661 PyErr_SetString(PyExc_SystemError,
662 "Negative size passed to _PyUnicode_New");
663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666#ifdef Py_DEBUG
667 ++unicode_old_new_calls;
668#endif
669
670 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
671 if (unicode == NULL)
672 return NULL;
673 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
674 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
675 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000676 PyErr_NoMemory();
677 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679
Jeremy Hyltond8082792003-09-16 19:41:39 +0000680 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000681 * the caller fails before initializing str -- unicode_resize()
682 * reads str[0], and the Keep-Alive optimization can keep memory
683 * allocated for str alive across a call to unicode_dealloc(unicode).
684 * We don't want unicode_resize to read uninitialized memory in
685 * that case.
686 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 _PyUnicode_WSTR(unicode)[0] = 0;
688 _PyUnicode_WSTR(unicode)[length] = 0;
689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 _PyUnicode_HASH(unicode) = -1;
691 _PyUnicode_STATE(unicode).interned = 0;
692 _PyUnicode_STATE(unicode).kind = 0;
693 _PyUnicode_STATE(unicode).compact = 0;
694 _PyUnicode_STATE(unicode).ready = 0;
695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200696 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200698 _PyUnicode_UTF8(unicode) = NULL;
699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000701
Benjamin Peterson29060642009-01-31 22:14:21 +0000702 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000703 /* XXX UNREF/NEWREF interface should be more symmetrical */
704 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000705 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000706 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708}
709
Victor Stinnerf42dc442011-10-02 23:33:16 +0200710static const char*
711unicode_kind_name(PyObject *unicode)
712{
Victor Stinner42dfd712011-10-03 14:41:45 +0200713 /* don't check consistency: unicode_kind_name() is called from
714 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (!PyUnicode_IS_COMPACT(unicode))
716 {
717 if (!PyUnicode_IS_READY(unicode))
718 return "wstr";
719 switch(PyUnicode_KIND(unicode))
720 {
721 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200722 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200723 return "legacy ascii";
724 else
725 return "legacy latin1";
726 case PyUnicode_2BYTE_KIND:
727 return "legacy UCS2";
728 case PyUnicode_4BYTE_KIND:
729 return "legacy UCS4";
730 default:
731 return "<legacy invalid kind>";
732 }
733 }
734 assert(PyUnicode_IS_READY(unicode));
735 switch(PyUnicode_KIND(unicode))
736 {
737 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 return "ascii";
740 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200741 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200742 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200743 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200744 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200745 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200746 default:
747 return "<invalid compact kind>";
748 }
749}
750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751#ifdef Py_DEBUG
752int unicode_new_new_calls = 0;
753
754/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
758
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
762void *_PyUnicode_data(void *unicode){
763 printf("obj %p\n", unicode);
764 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
765 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
766 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
767 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
768 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
769 return PyUnicode_DATA(unicode);
770}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200771
772void
773_PyUnicode_Dump(PyObject *op)
774{
775 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200776 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
777 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
778 void *data;
779 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
780 if (ascii->state.compact)
781 data = (compact + 1);
782 else
783 data = unicode->data.any;
784 if (ascii->wstr == data)
785 printf("shared ");
786 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200787 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200788 printf(" (%zu), ", compact->wstr_length);
789 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
790 printf("shared ");
791 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200792 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200793 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200794}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
797PyObject *
798PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
799{
800 PyObject *obj;
801 PyCompactUnicodeObject *unicode;
802 void *data;
803 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200804 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805 Py_ssize_t char_size;
806 Py_ssize_t struct_size;
807
808 /* Optimization for empty strings */
809 if (size == 0 && unicode_empty != NULL) {
810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200811 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812 }
813
814#ifdef Py_DEBUG
815 ++unicode_new_new_calls;
816#endif
817
Victor Stinner9e9d6892011-10-04 01:02:02 +0200818 is_ascii = 0;
819 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 struct_size = sizeof(PyCompactUnicodeObject);
821 if (maxchar < 128) {
822 kind_state = PyUnicode_1BYTE_KIND;
823 char_size = 1;
824 is_ascii = 1;
825 struct_size = sizeof(PyASCIIObject);
826 }
827 else if (maxchar < 256) {
828 kind_state = PyUnicode_1BYTE_KIND;
829 char_size = 1;
830 }
831 else if (maxchar < 65536) {
832 kind_state = PyUnicode_2BYTE_KIND;
833 char_size = 2;
834 if (sizeof(wchar_t) == 2)
835 is_sharing = 1;
836 }
837 else {
838 kind_state = PyUnicode_4BYTE_KIND;
839 char_size = 4;
840 if (sizeof(wchar_t) == 4)
841 is_sharing = 1;
842 }
843
844 /* Ensure we won't overflow the size. */
845 if (size < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to PyUnicode_New");
848 return NULL;
849 }
850 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
851 return PyErr_NoMemory();
852
853 /* Duplicated allocation code from _PyObject_New() instead of a call to
854 * PyObject_New() so we are able to allocate space for the object and
855 * it's data buffer.
856 */
857 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
858 if (obj == NULL)
859 return PyErr_NoMemory();
860 obj = PyObject_INIT(obj, &PyUnicode_Type);
861 if (obj == NULL)
862 return NULL;
863
864 unicode = (PyCompactUnicodeObject *)obj;
865 if (is_ascii)
866 data = ((PyASCIIObject*)obj) + 1;
867 else
868 data = unicode + 1;
869 _PyUnicode_LENGTH(unicode) = size;
870 _PyUnicode_HASH(unicode) = -1;
871 _PyUnicode_STATE(unicode).interned = 0;
872 _PyUnicode_STATE(unicode).kind = kind_state;
873 _PyUnicode_STATE(unicode).compact = 1;
874 _PyUnicode_STATE(unicode).ready = 1;
875 _PyUnicode_STATE(unicode).ascii = is_ascii;
876 if (is_ascii) {
877 ((char*)data)[size] = 0;
878 _PyUnicode_WSTR(unicode) = NULL;
879 }
880 else if (kind_state == PyUnicode_1BYTE_KIND) {
881 ((char*)data)[size] = 0;
882 _PyUnicode_WSTR(unicode) = NULL;
883 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200885 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 }
887 else {
888 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 if (kind_state == PyUnicode_2BYTE_KIND)
891 ((Py_UCS2*)data)[size] = 0;
892 else /* kind_state == PyUnicode_4BYTE_KIND */
893 ((Py_UCS4*)data)[size] = 0;
894 if (is_sharing) {
895 _PyUnicode_WSTR_LENGTH(unicode) = size;
896 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
897 }
898 else {
899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
900 _PyUnicode_WSTR(unicode) = NULL;
901 }
902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200903 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 return obj;
905}
906
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`, combining each valid high/low surrogate pair into a single
   code point.  Unpaired surrogates are copied through unchanged.  The other
   width conversions are implemented as macros for efficiency.

   The destination must already be a 4-byte-kind string whose length equals
   the number of code points produced; this is checked by assertions only.
   The function does not write a terminating null itself. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!(*src >= 0xD800 && *src <= 0xDBFF
              && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF))
        {
            /* ordinary code unit or lone surrogate */
            *dst++ = *src;
            src++;
            continue;
        }
        /* high surrogate followed by a low surrogate */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
945
Victor Stinnercd9950f2011-10-02 00:34:53 +0200946static int
947_PyUnicode_Dirty(PyObject *unicode)
948{
Victor Stinner910337b2011-10-03 03:20:16 +0200949 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200950 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200951 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200952 "Cannot modify a string having more than 1 reference");
953 return -1;
954 }
955 _PyUnicode_DIRTY(unicode);
956 return 0;
957}
958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200959static int
960_copy_characters(PyObject *to, Py_ssize_t to_start,
961 PyObject *from, Py_ssize_t from_start,
962 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 unsigned int from_kind, to_kind;
965 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200966 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200968 assert(PyUnicode_Check(from));
969 assert(PyUnicode_Check(to));
970 assert(PyUnicode_IS_READY(from));
971 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200973 assert(PyUnicode_GET_LENGTH(from) >= how_many);
974 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
975 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200977 if (how_many == 0)
978 return 0;
979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200983 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200985#ifdef Py_DEBUG
986 if (!check_maxchar
987 && (from_kind > to_kind
988 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200990 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
991 Py_UCS4 ch;
992 Py_ssize_t i;
993 for (i=0; i < how_many; i++) {
994 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
995 assert(ch <= to_maxchar);
996 }
997 }
998#endif
999 fast = (from_kind == to_kind);
1000 if (check_maxchar
1001 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1002 {
1003 /* deny latin1 => ascii */
1004 fast = 0;
1005 }
1006
1007 if (fast) {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001008 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001011 + PyUnicode_KIND_SIZE(from_kind, from_start),
1012 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001014 else if (from_kind == PyUnicode_1BYTE_KIND
1015 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001016 {
1017 _PyUnicode_CONVERT_BYTES(
1018 Py_UCS1, Py_UCS2,
1019 PyUnicode_1BYTE_DATA(from) + from_start,
1020 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1021 PyUnicode_2BYTE_DATA(to) + to_start
1022 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001023 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001024 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001025 && to_kind == PyUnicode_4BYTE_KIND)
1026 {
1027 _PyUnicode_CONVERT_BYTES(
1028 Py_UCS1, Py_UCS4,
1029 PyUnicode_1BYTE_DATA(from) + from_start,
1030 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1031 PyUnicode_4BYTE_DATA(to) + to_start
1032 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001033 }
1034 else if (from_kind == PyUnicode_2BYTE_KIND
1035 && to_kind == PyUnicode_4BYTE_KIND)
1036 {
1037 _PyUnicode_CONVERT_BYTES(
1038 Py_UCS2, Py_UCS4,
1039 PyUnicode_2BYTE_DATA(from) + from_start,
1040 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1041 PyUnicode_4BYTE_DATA(to) + to_start
1042 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001043 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001045 /* check if max_char(from substring) <= max_char(to) */
1046 if (from_kind > to_kind
1047 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001048 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001049 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001050 /* slow path to check for character overflow */
1051 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001053 Py_ssize_t i;
1054
Victor Stinner56c161a2011-10-06 02:47:11 +02001055#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001056 for (i=0; i < how_many; i++) {
1057 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001058 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001059 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1060 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001061#else
1062 if (!check_maxchar) {
1063 for (i=0; i < how_many; i++) {
1064 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1065 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1066 }
1067 }
1068 else {
1069 for (i=0; i < how_many; i++) {
1070 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1071 if (ch > to_maxchar)
1072 return 1;
1073 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1074 }
1075 }
1076#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001077 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001078 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001079 assert(0 && "inconsistent state");
1080 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001081 }
1082 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001083 return 0;
1084}
1085
1086static void
1087copy_characters(PyObject *to, Py_ssize_t to_start,
1088 PyObject *from, Py_ssize_t from_start,
1089 Py_ssize_t how_many)
1090{
1091 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1092}
1093
1094Py_ssize_t
1095PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1096 PyObject *from, Py_ssize_t from_start,
1097 Py_ssize_t how_many)
1098{
1099 int err;
1100
1101 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1102 PyErr_BadInternalCall();
1103 return -1;
1104 }
1105
1106 if (PyUnicode_READY(from))
1107 return -1;
1108 if (PyUnicode_READY(to))
1109 return -1;
1110
1111 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1112 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1113 PyErr_Format(PyExc_SystemError,
1114 "Cannot write %zi characters at %zi "
1115 "in a string of %zi characters",
1116 how_many, to_start, PyUnicode_GET_LENGTH(to));
1117 return -1;
1118 }
1119
1120 if (how_many == 0)
1121 return 0;
1122
1123 if (_PyUnicode_Dirty(to))
1124 return -1;
1125
1126 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1127 if (err) {
1128 PyErr_Format(PyExc_SystemError,
1129 "Cannot copy %s characters "
1130 "into a string of %s characters",
1131 unicode_kind_name(from),
1132 unicode_kind_name(to));
1133 return -1;
1134 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136}
1137
Victor Stinner17222162011-09-28 22:15:37 +02001138/* Find the maximum code point and count the number of surrogate pairs so a
1139 correct string length can be computed before converting a string to UCS4.
1140 This function counts single surrogates as a character and not as a pair.
1141
1142 Return 0 on success, or -1 on error. */
1143static int
1144find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1145 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
1147 const wchar_t *iter;
1148
Victor Stinnerc53be962011-10-02 21:33:54 +02001149 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 *num_surrogates = 0;
1151 *maxchar = 0;
1152
1153 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001154 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001156#if SIZEOF_WCHAR_T != 2
1157 if (*maxchar >= 0x10000)
1158 return 0;
1159#endif
1160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161#if SIZEOF_WCHAR_T == 2
1162 if (*iter >= 0xD800 && *iter <= 0xDBFF
1163 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1164 {
1165 Py_UCS4 surrogate_val;
1166 surrogate_val = (((iter[0] & 0x3FF)<<10)
1167 | (iter[1] & 0x3FF)) + 0x10000;
1168 ++(*num_surrogates);
1169 if (surrogate_val > *maxchar)
1170 *maxchar = surrogate_val;
1171 iter += 2;
1172 }
1173 else
1174 iter++;
1175#else
1176 iter++;
1177#endif
1178 }
1179 return 0;
1180}
1181
1182#ifdef Py_DEBUG
1183int unicode_ready_calls = 0;
1184#endif
1185
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001186static int
1187unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001189 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 wchar_t *end;
1191 Py_UCS4 maxchar = 0;
1192 Py_ssize_t num_surrogates;
1193#if SIZEOF_WCHAR_T == 2
1194 Py_ssize_t length_wo_surrogates;
1195#endif
1196
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001197 assert(p_obj != NULL);
1198 unicode = (PyUnicodeObject *)*p_obj;
1199
Georg Brandl7597add2011-10-05 16:36:47 +02001200 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001201 strings were created using _PyObject_New() and where no canonical
1202 representation (the str field) has been set yet aka strings
1203 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001204 assert(_PyUnicode_CHECK(unicode));
1205 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001207 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001208 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001209 /* Actually, it should neither be interned nor be anything else: */
1210 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
1212#ifdef Py_DEBUG
1213 ++unicode_ready_calls;
1214#endif
1215
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001216#ifdef Py_DEBUG
1217 assert(!replace || Py_REFCNT(unicode) == 1);
1218#else
1219 if (replace && Py_REFCNT(unicode) != 1)
1220 replace = 0;
1221#endif
1222 if (replace) {
1223 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1224 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1225 /* Optimization for empty strings */
1226 if (len == 0) {
1227 Py_INCREF(unicode_empty);
1228 Py_DECREF(*p_obj);
1229 *p_obj = unicode_empty;
1230 return 0;
1231 }
1232 if (len == 1 && wstr[0] < 256) {
1233 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1234 if (latin1_char == NULL)
1235 return -1;
1236 Py_DECREF(*p_obj);
1237 *p_obj = latin1_char;
1238 return 0;
1239 }
1240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001243 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001244 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246
1247 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001248 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1249 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 PyErr_NoMemory();
1251 return -1;
1252 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001253 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 _PyUnicode_WSTR(unicode), end,
1255 PyUnicode_1BYTE_DATA(unicode));
1256 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1257 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1258 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1259 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001260 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001261 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001262 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 }
1264 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001265 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001266 _PyUnicode_UTF8(unicode) = NULL;
1267 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 }
1269 PyObject_FREE(_PyUnicode_WSTR(unicode));
1270 _PyUnicode_WSTR(unicode) = NULL;
1271 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1272 }
1273 /* In this case we might have to convert down from 4-byte native
1274 wchar_t to 2-byte unicode. */
1275 else if (maxchar < 65536) {
1276 assert(num_surrogates == 0 &&
1277 "FindMaxCharAndNumSurrogatePairs() messed up");
1278
Victor Stinner506f5922011-09-28 22:34:18 +02001279#if SIZEOF_WCHAR_T == 2
1280 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001281 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001282 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1283 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1284 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001285 _PyUnicode_UTF8(unicode) = NULL;
1286 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001287#else
1288 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001289 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001290 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001291 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001292 PyErr_NoMemory();
1293 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 }
Victor Stinner506f5922011-09-28 22:34:18 +02001295 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1296 _PyUnicode_WSTR(unicode), end,
1297 PyUnicode_2BYTE_DATA(unicode));
1298 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1299 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1300 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001301 _PyUnicode_UTF8(unicode) = NULL;
1302 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001303 PyObject_FREE(_PyUnicode_WSTR(unicode));
1304 _PyUnicode_WSTR(unicode) = NULL;
1305 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1306#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 }
1308 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1309 else {
1310#if SIZEOF_WCHAR_T == 2
1311 /* in case the native representation is 2-bytes, we need to allocate a
1312 new normalized 4-byte version. */
1313 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001314 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1315 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyErr_NoMemory();
1317 return -1;
1318 }
1319 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1320 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001321 _PyUnicode_UTF8(unicode) = NULL;
1322 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001323 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1324 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001325 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyObject_FREE(_PyUnicode_WSTR(unicode));
1327 _PyUnicode_WSTR(unicode) = NULL;
1328 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1329#else
1330 assert(num_surrogates == 0);
1331
Victor Stinnerc3c74152011-10-02 20:39:55 +02001332 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001334 _PyUnicode_UTF8(unicode) = NULL;
1335 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1337#endif
1338 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1339 }
1340 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001341 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 return 0;
1343}
1344
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001345int
1346_PyUnicode_ReadyReplace(PyObject **op)
1347{
1348 return unicode_ready(op, 1);
1349}
1350
1351int
1352_PyUnicode_Ready(PyObject *op)
1353{
1354 return unicode_ready(&op, 0);
1355}
1356
Alexander Belopolsky40018472011-02-26 01:02:56 +00001357static void
1358unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
Walter Dörwald16807132007-05-25 13:52:07 +00001360 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 case SSTATE_NOT_INTERNED:
1362 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001363
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 case SSTATE_INTERNED_MORTAL:
1365 /* revive dead object temporarily for DelItem */
1366 Py_REFCNT(unicode) = 3;
1367 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1368 Py_FatalError(
1369 "deletion of interned string failed");
1370 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 case SSTATE_INTERNED_IMMORTAL:
1373 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 default:
1376 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001377 }
1378
Victor Stinner03490912011-10-03 23:45:12 +02001379 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001381 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001382 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383
1384 if (PyUnicode_IS_COMPACT(unicode)) {
1385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 if (_PyUnicode_DATA_ANY(unicode))
1389 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 }
1392}
1393
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the shared empty string or
   the shared entry of unicode_latin1[] for its single character, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a ready string of length one can be a latin1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1410
Alexander Belopolsky40018472011-02-26 01:02:56 +00001411static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001412unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001413{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001414 if (Py_REFCNT(unicode) != 1)
1415 return 0;
1416 if (PyUnicode_CHECK_INTERNED(unicode))
1417 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001418#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001419 /* singleton refcount is greater than 1 */
1420 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001421#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001422 return 1;
1423}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001424
Victor Stinnerfe226c02011-10-03 03:52:20 +02001425static int
1426unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1427{
1428 PyObject *unicode;
1429 Py_ssize_t old_length;
1430
1431 assert(p_unicode != NULL);
1432 unicode = *p_unicode;
1433
1434 assert(unicode != NULL);
1435 assert(PyUnicode_Check(unicode));
1436 assert(0 <= length);
1437
Victor Stinner910337b2011-10-03 03:20:16 +02001438 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001439 old_length = PyUnicode_WSTR_LENGTH(unicode);
1440 else
1441 old_length = PyUnicode_GET_LENGTH(unicode);
1442 if (old_length == length)
1443 return 0;
1444
Victor Stinnerfe226c02011-10-03 03:52:20 +02001445 if (!unicode_resizable(unicode)) {
1446 PyObject *copy = resize_copy(unicode, length);
1447 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001449 Py_DECREF(*p_unicode);
1450 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001451 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001452 }
1453
Victor Stinnerfe226c02011-10-03 03:52:20 +02001454 if (PyUnicode_IS_COMPACT(unicode)) {
1455 *p_unicode = resize_compact(unicode, length);
1456 if (*p_unicode == NULL)
1457 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001458 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001459 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001460 }
1461 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001465PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001466{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001467 PyObject *unicode;
1468 if (p_unicode == NULL) {
1469 PyErr_BadInternalCall();
1470 return -1;
1471 }
1472 unicode = *p_unicode;
1473 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1474 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1475 {
1476 PyErr_BadInternalCall();
1477 return -1;
1478 }
1479 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001480}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482static PyObject*
1483get_latin1_char(unsigned char ch)
1484{
Victor Stinnera464fc12011-10-02 20:39:30 +02001485 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 if (!unicode)
1489 return NULL;
1490 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001491 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 unicode_latin1[ch] = unicode;
1493 }
1494 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001495 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496}
1497
Alexander Belopolsky40018472011-02-26 01:02:56 +00001498PyObject *
1499PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500{
1501 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 Py_UCS4 maxchar = 0;
1503 Py_ssize_t num_surrogates;
1504
1505 if (u == NULL)
1506 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001508 /* If the Unicode data is known at construction time, we can apply
1509 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 /* Optimization for empty strings */
1512 if (size == 0 && unicode_empty != NULL) {
1513 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001514 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001515 }
Tim Petersced69f82003-09-16 20:30:58 +00001516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 /* Single character Unicode objects in the Latin-1 range are
1518 shared when using this constructor */
1519 if (size == 1 && *u < 256)
1520 return get_latin1_char((unsigned char)*u);
1521
1522 /* If not empty and not single character, copy the Unicode data
1523 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001524 if (find_maxchar_surrogates(u, u + size,
1525 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 return NULL;
1527
1528 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1529 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (!unicode)
1531 return NULL;
1532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 switch (PyUnicode_KIND(unicode)) {
1534 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001535 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1537 break;
1538 case PyUnicode_2BYTE_KIND:
1539#if Py_UNICODE_SIZE == 2
1540 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1541#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001542 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1544#endif
1545 break;
1546 case PyUnicode_4BYTE_KIND:
1547#if SIZEOF_WCHAR_T == 2
1548 /* This is the only case which has to process surrogates, thus
1549 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001550 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551#else
1552 assert(num_surrogates == 0);
1553 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1554#endif
1555 break;
1556 default:
1557 assert(0 && "Impossible state");
1558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001560 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return (PyObject *)unicode;
1562}
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564PyObject *
1565PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001566{
1567 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001568
Benjamin Peterson14339b62009-01-31 16:36:08 +00001569 if (size < 0) {
1570 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001572 return NULL;
1573 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001574
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001575 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001576 some optimizations which share commonly used objects.
1577 Also, this means the input must be UTF-8, so fall back to the
1578 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001579 if (u != NULL) {
1580
Benjamin Peterson29060642009-01-31 22:14:21 +00001581 /* Optimization for empty strings */
1582 if (size == 0 && unicode_empty != NULL) {
1583 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001584 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001586
1587 /* Single characters are shared when using this constructor.
1588 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 if (size == 1 && Py_CHARMASK(*u) < 128)
1590 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001591
1592 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001593 }
1594
Walter Dörwald55507312007-05-18 13:12:10 +00001595 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001596 if (!unicode)
1597 return NULL;
1598
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001599 return (PyObject *)unicode;
1600}
1601
Alexander Belopolsky40018472011-02-26 01:02:56 +00001602PyObject *
1603PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001604{
1605 size_t size = strlen(u);
1606 if (size > PY_SSIZE_T_MAX) {
1607 PyErr_SetString(PyExc_OverflowError, "input too long");
1608 return NULL;
1609 }
1610
1611 return PyUnicode_FromStringAndSize(u, size);
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001615unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001616{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001617 PyObject *res;
1618#ifdef Py_DEBUG
1619 const unsigned char *p;
1620 const unsigned char *end = s + size;
1621 for (p=s; p < end; p++) {
1622 assert(*p < 128);
1623 }
1624#endif
1625 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001626 if (!res)
1627 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001628 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001629 return res;
1630}
1631
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001632static Py_UCS4
1633kind_maxchar_limit(unsigned int kind)
1634{
1635 switch(kind) {
1636 case PyUnicode_1BYTE_KIND:
1637 return 0x80;
1638 case PyUnicode_2BYTE_KIND:
1639 return 0x100;
1640 case PyUnicode_4BYTE_KIND:
1641 return 0x10000;
1642 default:
1643 assert(0 && "invalid kind");
1644 return 0x10ffff;
1645 }
1646}
1647
Victor Stinner702c7342011-10-05 13:50:52 +02001648static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001652 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654
1655 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 for (i = 0; i < size; i++) {
1657 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001658 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001660 }
1661 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 if (!res)
1664 return NULL;
1665 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001666 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001668}
1669
Victor Stinnere57b1c02011-09-28 22:20:48 +02001670static PyObject*
1671_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672{
1673 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001674 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001676
1677 assert(size >= 0);
1678 for (i = 0; i < size; i++) {
1679 if (u[i] > max_char) {
1680 max_char = u[i];
1681 if (max_char >= 256)
1682 break;
1683 }
1684 }
1685 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 if (!res)
1687 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001688 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001689 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1690 else
1691 for (i = 0; i < size; i++)
1692 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001693 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 return res;
1695}
1696
Victor Stinnere57b1c02011-09-28 22:20:48 +02001697static PyObject*
1698_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699{
1700 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001701 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001703
1704 assert(size >= 0);
1705 for (i = 0; i < size; i++) {
1706 if (u[i] > max_char) {
1707 max_char = u[i];
1708 if (max_char >= 0x10000)
1709 break;
1710 }
1711 }
1712 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!res)
1714 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001715 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1717 else {
1718 int kind = PyUnicode_KIND(res);
1719 void *data = PyUnicode_DATA(res);
1720 for (i = 0; i < size; i++)
1721 PyUnicode_WRITE(kind, data, i, u[i]);
1722 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001723 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 return res;
1725}
1726
1727PyObject*
1728PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1729{
1730 switch(kind) {
1731 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001732 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001734 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001736 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001737 default:
1738 assert(0 && "invalid kind");
1739 PyErr_SetString(PyExc_SystemError, "invalid kind");
1740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742}
1743
Victor Stinner25a4b292011-10-06 12:31:55 +02001744/* Ensure that a string uses the most efficient storage, if it is not the
1745 case: create a new string with of the right kind. Write NULL into *p_unicode
1746 on error. */
1747void
1748unicode_adjust_maxchar(PyObject **p_unicode)
1749{
1750 PyObject *unicode, *copy;
1751 Py_UCS4 max_char;
1752 Py_ssize_t i, len;
1753 unsigned int kind;
1754
1755 assert(p_unicode != NULL);
1756 unicode = *p_unicode;
1757 assert(PyUnicode_IS_READY(unicode));
1758 if (PyUnicode_IS_ASCII(unicode))
1759 return;
1760
1761 len = PyUnicode_GET_LENGTH(unicode);
1762 kind = PyUnicode_KIND(unicode);
1763 if (kind == PyUnicode_1BYTE_KIND) {
1764 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1765 for (i = 0; i < len; i++) {
1766 if (u[i] & 0x80)
1767 return;
1768 }
1769 max_char = 127;
1770 }
1771 else if (kind == PyUnicode_2BYTE_KIND) {
1772 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1773 max_char = 0;
1774 for (i = 0; i < len; i++) {
1775 if (u[i] > max_char) {
1776 max_char = u[i];
1777 if (max_char >= 256)
1778 return;
1779 }
1780 }
1781 }
1782 else {
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001783 const Py_UCS4 *u;
Victor Stinner25a4b292011-10-06 12:31:55 +02001784 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitrou15a66cf2011-10-06 15:25:32 +02001785 u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001786 max_char = 0;
1787 for (i = 0; i < len; i++) {
1788 if (u[i] > max_char) {
1789 max_char = u[i];
1790 if (max_char >= 0x10000)
1791 return;
1792 }
1793 }
1794 }
Victor Stinner200f2132011-10-06 13:27:56 +02001795 assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinner25a4b292011-10-06 12:31:55 +02001796 copy = PyUnicode_New(len, max_char);
1797 copy_characters(copy, 0, unicode, 0, len);
1798 Py_DECREF(unicode);
1799 *p_unicode = copy;
1800}
1801
Victor Stinner034f6cf2011-09-30 02:26:44 +02001802PyObject*
1803PyUnicode_Copy(PyObject *unicode)
1804{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001805 Py_ssize_t size;
1806 PyObject *copy;
1807 void *data;
1808
Victor Stinner034f6cf2011-09-30 02:26:44 +02001809 if (!PyUnicode_Check(unicode)) {
1810 PyErr_BadInternalCall();
1811 return NULL;
1812 }
1813 if (PyUnicode_READY(unicode))
1814 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001815
1816 size = PyUnicode_GET_LENGTH(unicode);
1817 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1818 if (!copy)
1819 return NULL;
1820 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1821
1822 data = PyUnicode_DATA(unicode);
1823 switch (PyUnicode_KIND(unicode))
1824 {
1825 case PyUnicode_1BYTE_KIND:
1826 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1827 break;
1828 case PyUnicode_2BYTE_KIND:
1829 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1830 break;
1831 case PyUnicode_4BYTE_KIND:
1832 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1833 break;
1834 default:
1835 assert(0);
1836 break;
1837 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001838 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001839 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001840}
1841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842
Victor Stinnerbc603d12011-10-02 01:00:40 +02001843/* Widen Unicode objects to larger buffers. Don't write terminating null
1844 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845
1846void*
1847_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1848{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001849 Py_ssize_t len;
1850 void *result;
1851 unsigned int skind;
1852
1853 if (PyUnicode_READY(s))
1854 return NULL;
1855
1856 len = PyUnicode_GET_LENGTH(s);
1857 skind = PyUnicode_KIND(s);
1858 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001859 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 return NULL;
1861 }
1862 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001863 case PyUnicode_2BYTE_KIND:
1864 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1865 if (!result)
1866 return PyErr_NoMemory();
1867 assert(skind == PyUnicode_1BYTE_KIND);
1868 _PyUnicode_CONVERT_BYTES(
1869 Py_UCS1, Py_UCS2,
1870 PyUnicode_1BYTE_DATA(s),
1871 PyUnicode_1BYTE_DATA(s) + len,
1872 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001874 case PyUnicode_4BYTE_KIND:
1875 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1876 if (!result)
1877 return PyErr_NoMemory();
1878 if (skind == PyUnicode_2BYTE_KIND) {
1879 _PyUnicode_CONVERT_BYTES(
1880 Py_UCS2, Py_UCS4,
1881 PyUnicode_2BYTE_DATA(s),
1882 PyUnicode_2BYTE_DATA(s) + len,
1883 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001885 else {
1886 assert(skind == PyUnicode_1BYTE_KIND);
1887 _PyUnicode_CONVERT_BYTES(
1888 Py_UCS1, Py_UCS4,
1889 PyUnicode_1BYTE_DATA(s),
1890 PyUnicode_1BYTE_DATA(s) + len,
1891 result);
1892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001894 default:
1895 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
Victor Stinner01698042011-10-04 00:04:26 +02001897 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 return NULL;
1899}
1900
1901static Py_UCS4*
1902as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1903 int copy_null)
1904{
1905 int kind;
1906 void *data;
1907 Py_ssize_t len, targetlen;
1908 if (PyUnicode_READY(string) == -1)
1909 return NULL;
1910 kind = PyUnicode_KIND(string);
1911 data = PyUnicode_DATA(string);
1912 len = PyUnicode_GET_LENGTH(string);
1913 targetlen = len;
1914 if (copy_null)
1915 targetlen++;
1916 if (!target) {
1917 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1918 PyErr_NoMemory();
1919 return NULL;
1920 }
1921 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1922 if (!target) {
1923 PyErr_NoMemory();
1924 return NULL;
1925 }
1926 }
1927 else {
1928 if (targetsize < targetlen) {
1929 PyErr_Format(PyExc_SystemError,
1930 "string is longer than the buffer");
1931 if (copy_null && 0 < targetsize)
1932 target[0] = 0;
1933 return NULL;
1934 }
1935 }
1936 if (kind != PyUnicode_4BYTE_KIND) {
1937 Py_ssize_t i;
1938 for (i = 0; i < len; i++)
1939 target[i] = PyUnicode_READ(kind, data, i);
1940 }
1941 else
1942 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1943 if (copy_null)
1944 target[len] = 0;
1945 return target;
1946}
1947
1948Py_UCS4*
1949PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1950 int copy_null)
1951{
1952 if (target == NULL || targetsize < 1) {
1953 PyErr_BadInternalCall();
1954 return NULL;
1955 }
1956 return as_ucs4(string, target, targetsize, copy_null);
1957}
1958
1959Py_UCS4*
1960PyUnicode_AsUCS4Copy(PyObject *string)
1961{
1962 return as_ucs4(string, NULL, 0, 1);
1963}
1964
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w` of `size`
   characters.  A size of -1 means the length is taken with wcslen().
   With w == NULL, an empty string is returned for size 0 and a bad
   internal call error is raised for any other size. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = size;

        if (nchars == -1)
            nchars = wcslen(w);
        return PyUnicode_FromUnicode(w, nchars);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001985
Walter Dörwald346737f2007-05-31 10:44:43 +00001986static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001987makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1988 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001989{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 *fmt++ = '%';
1991 if (width) {
1992 if (zeropad)
1993 *fmt++ = '0';
1994 fmt += sprintf(fmt, "%d", width);
1995 }
1996 if (precision)
1997 fmt += sprintf(fmt, ".%d", precision);
1998 if (longflag)
1999 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002000 else if (longlongflag) {
2001 /* longlongflag should only ever be nonzero on machines with
2002 HAVE_LONG_LONG defined */
2003#ifdef HAVE_LONG_LONG
2004 char *f = PY_FORMAT_LONG_LONG;
2005 while (*f)
2006 *fmt++ = *f++;
2007#else
2008 /* we shouldn't ever get here */
2009 assert(0);
2010 *fmt++ = 'l';
2011#endif
2012 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002013 else if (size_tflag) {
2014 char *f = PY_FORMAT_SIZE_T;
2015 while (*f)
2016 *fmt++ = *f++;
2017 }
2018 *fmt++ = c;
2019 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002020}
2021
Victor Stinner96865452011-03-01 23:44:09 +00002022/* helper for PyUnicode_FromFormatV() */
2023
2024static const char*
2025parse_format_flags(const char *f,
2026 int *p_width, int *p_precision,
2027 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2028{
2029 int width, precision, longflag, longlongflag, size_tflag;
2030
2031 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2032 f++;
2033 width = 0;
2034 while (Py_ISDIGIT((unsigned)*f))
2035 width = (width*10) + *f++ - '0';
2036 precision = 0;
2037 if (*f == '.') {
2038 f++;
2039 while (Py_ISDIGIT((unsigned)*f))
2040 precision = (precision*10) + *f++ - '0';
2041 if (*f == '%') {
2042 /* "%.3%s" => f points to "3" */
2043 f--;
2044 }
2045 }
2046 if (*f == '\0') {
2047 /* bogus format "%.1" => go backward, f points to "1" */
2048 f--;
2049 }
2050 if (p_width != NULL)
2051 *p_width = width;
2052 if (p_precision != NULL)
2053 *p_precision = precision;
2054
2055 /* Handle %ld, %lu, %lld and %llu. */
2056 longflag = 0;
2057 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002058 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002059
2060 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002061 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002062 longflag = 1;
2063 ++f;
2064 }
2065#ifdef HAVE_LONG_LONG
2066 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002067 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002068 longlongflag = 1;
2069 f += 2;
2070 }
2071#endif
2072 }
2073 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002074 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002075 size_tflag = 1;
2076 ++f;
2077 }
2078 if (p_longflag != NULL)
2079 *p_longflag = longflag;
2080 if (p_longlongflag != NULL)
2081 *p_longlongflag = longlongflag;
2082 if (p_size_tflag != NULL)
2083 *p_size_tflag = size_tflag;
2084 return f;
2085}
2086
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.  At most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2094
Walter Dörwaldd2034312007-05-18 16:29:38 +00002095PyObject *
2096PyUnicode_FromFormatV(const char *format, va_list vargs)
2097{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 va_list count;
2099 Py_ssize_t callcount = 0;
2100 PyObject **callresults = NULL;
2101 PyObject **callresult = NULL;
2102 Py_ssize_t n = 0;
2103 int width = 0;
2104 int precision = 0;
2105 int zeropad;
2106 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002107 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002108 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002109 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2111 Py_UCS4 argmaxchar;
2112 Py_ssize_t numbersize = 0;
2113 char *numberresults = NULL;
2114 char *numberresult = NULL;
2115 Py_ssize_t i;
2116 int kind;
2117 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002118
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002119 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002120 /* step 1: count the number of %S/%R/%A/%s format specifications
2121 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2122 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002124 * also estimate a upper bound for all the number formats in the string,
2125 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002127 for (f = format; *f; f++) {
2128 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002129 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2131 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2132 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2133 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002136#ifdef HAVE_LONG_LONG
2137 if (longlongflag) {
2138 if (width < MAX_LONG_LONG_CHARS)
2139 width = MAX_LONG_LONG_CHARS;
2140 }
2141 else
2142#endif
2143 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2144 including sign. Decimal takes the most space. This
2145 isn't enough for octal. If a width is specified we
2146 need more (which we allocate later). */
2147 if (width < MAX_LONG_CHARS)
2148 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149
2150 /* account for the size + '\0' to separate numbers
2151 inside of the numberresults buffer */
2152 numbersize += (width + 1);
2153 }
2154 }
2155 else if ((unsigned char)*f > 127) {
2156 PyErr_Format(PyExc_ValueError,
2157 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2158 "string, got a non-ASCII byte: 0x%02x",
2159 (unsigned char)*f);
2160 return NULL;
2161 }
2162 }
2163 /* step 2: allocate memory for the results of
2164 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2165 if (callcount) {
2166 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2167 if (!callresults) {
2168 PyErr_NoMemory();
2169 return NULL;
2170 }
2171 callresult = callresults;
2172 }
2173 /* step 2.5: allocate memory for the results of formating numbers */
2174 if (numbersize) {
2175 numberresults = PyObject_Malloc(numbersize);
2176 if (!numberresults) {
2177 PyErr_NoMemory();
2178 goto fail;
2179 }
2180 numberresult = numberresults;
2181 }
2182
2183 /* step 3: format numbers and figure out how large a buffer we need */
2184 for (f = format; *f; f++) {
2185 if (*f == '%') {
2186 const char* p;
2187 int longflag;
2188 int longlongflag;
2189 int size_tflag;
2190 int numprinted;
2191
2192 p = f;
2193 zeropad = (f[1] == '0');
2194 f = parse_format_flags(f, &width, &precision,
2195 &longflag, &longlongflag, &size_tflag);
2196 switch (*f) {
2197 case 'c':
2198 {
2199 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002200 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 n++;
2202 break;
2203 }
2204 case '%':
2205 n++;
2206 break;
2207 case 'i':
2208 case 'd':
2209 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2210 width, precision, *f);
2211 if (longflag)
2212 numprinted = sprintf(numberresult, fmt,
2213 va_arg(count, long));
2214#ifdef HAVE_LONG_LONG
2215 else if (longlongflag)
2216 numprinted = sprintf(numberresult, fmt,
2217 va_arg(count, PY_LONG_LONG));
2218#endif
2219 else if (size_tflag)
2220 numprinted = sprintf(numberresult, fmt,
2221 va_arg(count, Py_ssize_t));
2222 else
2223 numprinted = sprintf(numberresult, fmt,
2224 va_arg(count, int));
2225 n += numprinted;
2226 /* advance by +1 to skip over the '\0' */
2227 numberresult += (numprinted + 1);
2228 assert(*(numberresult - 1) == '\0');
2229 assert(*(numberresult - 2) != '\0');
2230 assert(numprinted >= 0);
2231 assert(numberresult <= numberresults + numbersize);
2232 break;
2233 case 'u':
2234 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2235 width, precision, 'u');
2236 if (longflag)
2237 numprinted = sprintf(numberresult, fmt,
2238 va_arg(count, unsigned long));
2239#ifdef HAVE_LONG_LONG
2240 else if (longlongflag)
2241 numprinted = sprintf(numberresult, fmt,
2242 va_arg(count, unsigned PY_LONG_LONG));
2243#endif
2244 else if (size_tflag)
2245 numprinted = sprintf(numberresult, fmt,
2246 va_arg(count, size_t));
2247 else
2248 numprinted = sprintf(numberresult, fmt,
2249 va_arg(count, unsigned int));
2250 n += numprinted;
2251 numberresult += (numprinted + 1);
2252 assert(*(numberresult - 1) == '\0');
2253 assert(*(numberresult - 2) != '\0');
2254 assert(numprinted >= 0);
2255 assert(numberresult <= numberresults + numbersize);
2256 break;
2257 case 'x':
2258 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2259 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2260 n += numprinted;
2261 numberresult += (numprinted + 1);
2262 assert(*(numberresult - 1) == '\0');
2263 assert(*(numberresult - 2) != '\0');
2264 assert(numprinted >= 0);
2265 assert(numberresult <= numberresults + numbersize);
2266 break;
2267 case 'p':
2268 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2269 /* %p is ill-defined: ensure leading 0x. */
2270 if (numberresult[1] == 'X')
2271 numberresult[1] = 'x';
2272 else if (numberresult[1] != 'x') {
2273 memmove(numberresult + 2, numberresult,
2274 strlen(numberresult) + 1);
2275 numberresult[0] = '0';
2276 numberresult[1] = 'x';
2277 numprinted += 2;
2278 }
2279 n += numprinted;
2280 numberresult += (numprinted + 1);
2281 assert(*(numberresult - 1) == '\0');
2282 assert(*(numberresult - 2) != '\0');
2283 assert(numprinted >= 0);
2284 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 break;
2286 case 's':
2287 {
2288 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002289 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002290 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2291 if (!str)
2292 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 /* since PyUnicode_DecodeUTF8 returns already flexible
2294 unicode objects, there is no need to call ready on them */
2295 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002296 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002298 /* Remember the str and switch to the next slot */
2299 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 break;
2301 }
2302 case 'U':
2303 {
2304 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002305 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 if (PyUnicode_READY(obj) == -1)
2307 goto fail;
2308 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002309 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002311 break;
2312 }
2313 case 'V':
2314 {
2315 PyObject *obj = va_arg(count, PyObject *);
2316 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002317 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002319 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002320 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321 if (PyUnicode_READY(obj) == -1)
2322 goto fail;
2323 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002324 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002326 *callresult++ = NULL;
2327 }
2328 else {
2329 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2330 if (!str_obj)
2331 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002332 if (PyUnicode_READY(str_obj)) {
2333 Py_DECREF(str_obj);
2334 goto fail;
2335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002337 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002339 *callresult++ = str_obj;
2340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 break;
2342 }
2343 case 'S':
2344 {
2345 PyObject *obj = va_arg(count, PyObject *);
2346 PyObject *str;
2347 assert(obj);
2348 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002350 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002352 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002353 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002354 /* Remember the str and switch to the next slot */
2355 *callresult++ = str;
2356 break;
2357 }
2358 case 'R':
2359 {
2360 PyObject *obj = va_arg(count, PyObject *);
2361 PyObject *repr;
2362 assert(obj);
2363 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002367 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002369 /* Remember the repr and switch to the next slot */
2370 *callresult++ = repr;
2371 break;
2372 }
2373 case 'A':
2374 {
2375 PyObject *obj = va_arg(count, PyObject *);
2376 PyObject *ascii;
2377 assert(obj);
2378 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002380 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002382 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002384 /* Remember the repr and switch to the next slot */
2385 *callresult++ = ascii;
2386 break;
2387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002388 default:
2389 /* if we stumble upon an unknown
2390 formatting code, copy the rest of
2391 the format string to the output
2392 string. (we cannot just skip the
2393 code, since there's no way to know
2394 what's in the argument list) */
2395 n += strlen(p);
2396 goto expand;
2397 }
2398 } else
2399 n++;
2400 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002402 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002404 we don't have to resize the string.
2405 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002406 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 if (!string)
2408 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 kind = PyUnicode_KIND(string);
2410 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002411 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002415 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002416 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002417
2418 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2420 /* checking for == because the last argument could be a empty
2421 string, which causes i to point to end, the assert at the end of
2422 the loop */
2423 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002424
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 switch (*f) {
2426 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002427 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 const int ordinal = va_arg(vargs, int);
2429 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002431 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002432 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002433 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 case 'p':
2437 /* unused, since we already have the result */
2438 if (*f == 'p')
2439 (void) va_arg(vargs, void *);
2440 else
2441 (void) va_arg(vargs, int);
2442 /* extract the result from numberresults and append. */
2443 for (; *numberresult; ++i, ++numberresult)
2444 PyUnicode_WRITE(kind, data, i, *numberresult);
2445 /* skip over the separating '\0' */
2446 assert(*numberresult == '\0');
2447 numberresult++;
2448 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002449 break;
2450 case 's':
2451 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002452 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002454 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 size = PyUnicode_GET_LENGTH(*callresult);
2456 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002457 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002459 /* We're done with the unicode()/repr() => forget it */
2460 Py_DECREF(*callresult);
2461 /* switch to next unicode()/repr() result */
2462 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002463 break;
2464 }
2465 case 'U':
2466 {
2467 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 Py_ssize_t size;
2469 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2470 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002471 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 break;
2474 }
2475 case 'V':
2476 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002479 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002480 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 size = PyUnicode_GET_LENGTH(obj);
2482 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002483 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002485 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 size = PyUnicode_GET_LENGTH(*callresult);
2487 assert(PyUnicode_KIND(*callresult) <=
2488 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002489 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002491 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002493 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 break;
2495 }
2496 case 'S':
2497 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002498 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002500 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 /* unused, since we already have the result */
2502 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002504 copy_characters(string, i, *callresult, 0, size);
2505 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002506 /* We're done with the unicode()/repr() => forget it */
2507 Py_DECREF(*callresult);
2508 /* switch to next unicode()/repr() result */
2509 ++callresult;
2510 break;
2511 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 break;
2515 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 for (; *p; ++p, ++i)
2517 PyUnicode_WRITE(kind, data, i, *p);
2518 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 goto end;
2520 }
Victor Stinner1205f272010-09-11 00:54:47 +00002521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 else {
2523 assert(i < PyUnicode_GET_LENGTH(string));
2524 PyUnicode_WRITE(kind, data, i++, *f);
2525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002528
Benjamin Peterson29060642009-01-31 22:14:21 +00002529 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 if (callresults)
2531 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 if (numberresults)
2533 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002534 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002536 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 if (callresults) {
2538 PyObject **callresult2 = callresults;
2539 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002540 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 ++callresult2;
2542 }
2543 PyObject_Free(callresults);
2544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 if (numberresults)
2546 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002548}
2549
Walter Dörwaldd2034312007-05-18 16:29:38 +00002550PyObject *
2551PyUnicode_FromFormat(const char *format, ...)
2552{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 PyObject* ret;
2554 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
2556#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002558#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002559 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002560#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 ret = PyUnicode_FromFormatV(format, vargs);
2562 va_end(vargs);
2563 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002564}
2565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566#ifdef HAVE_WCHAR_H
2567
Victor Stinner5593d8a2010-10-02 11:11:27 +00002568/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2569 convert a Unicode object to a wide character string.
2570
Victor Stinnerd88d9832011-09-06 02:00:05 +02002571 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002572 character) required to convert the unicode object. Ignore size argument.
2573
Victor Stinnerd88d9832011-09-06 02:00:05 +02002574 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002575 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002576 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002577static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002578unicode_aswidechar(PyUnicodeObject *unicode,
2579 wchar_t *w,
2580 Py_ssize_t size)
2581{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002582 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583 const wchar_t *wstr;
2584
2585 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2586 if (wstr == NULL)
2587 return -1;
2588
Victor Stinner5593d8a2010-10-02 11:11:27 +00002589 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002590 if (size > res)
2591 size = res + 1;
2592 else
2593 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002595 return res;
2596 }
2597 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002599}
2600
2601Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002602PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002603 wchar_t *w,
2604 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605{
2606 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 PyErr_BadInternalCall();
2608 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002610 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611}
2612
Victor Stinner137c34c2010-09-29 10:25:54 +00002613wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002614PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002615 Py_ssize_t *size)
2616{
2617 wchar_t* buffer;
2618 Py_ssize_t buflen;
2619
2620 if (unicode == NULL) {
2621 PyErr_BadInternalCall();
2622 return NULL;
2623 }
2624
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002625 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 if (buflen == -1)
2627 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002628 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002629 PyErr_NoMemory();
2630 return NULL;
2631 }
2632
Victor Stinner137c34c2010-09-29 10:25:54 +00002633 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2634 if (buffer == NULL) {
2635 PyErr_NoMemory();
2636 return NULL;
2637 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002638 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 if (buflen == -1)
2640 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002641 if (size != NULL)
2642 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002643 return buffer;
2644}
2645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647
Alexander Belopolsky40018472011-02-26 01:02:56 +00002648PyObject *
2649PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002652 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 PyErr_SetString(PyExc_ValueError,
2654 "chr() arg not in range(0x110000)");
2655 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002656 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 if (ordinal < 256)
2659 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 v = PyUnicode_New(1, ordinal);
2662 if (v == NULL)
2663 return NULL;
2664 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002665 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002667}
2668
Alexander Belopolsky40018472011-02-26 01:02:56 +00002669PyObject *
2670PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002672 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002673 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002674 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002675 if (PyUnicode_READY(obj))
2676 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 Py_INCREF(obj);
2678 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002679 }
2680 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002681 /* For a Unicode subtype that's not a Unicode object,
2682 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002683 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002684 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002685 PyErr_Format(PyExc_TypeError,
2686 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002687 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002688 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002689}
2690
Alexander Belopolsky40018472011-02-26 01:02:56 +00002691PyObject *
2692PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002693 const char *encoding,
2694 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002695{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002696 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002697 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 PyErr_BadInternalCall();
2701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002703
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002704 /* Decoding bytes objects is the most common case and should be fast */
2705 if (PyBytes_Check(obj)) {
2706 if (PyBytes_GET_SIZE(obj) == 0) {
2707 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002708 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002709 }
2710 else {
2711 v = PyUnicode_Decode(
2712 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2713 encoding, errors);
2714 }
2715 return v;
2716 }
2717
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002718 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 PyErr_SetString(PyExc_TypeError,
2720 "decoding str is not supported");
2721 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002723
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002724 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2725 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2726 PyErr_Format(PyExc_TypeError,
2727 "coercing to str: need bytes, bytearray "
2728 "or buffer-like object, %.80s found",
2729 Py_TYPE(obj)->tp_name);
2730 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002731 }
Tim Petersced69f82003-09-16 20:30:58 +00002732
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002733 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002735 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 }
Tim Petersced69f82003-09-16 20:30:58 +00002737 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002738 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002739
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002740 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002741 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742}
2743
/* Write a normalized copy of an encoding name into lower: upper case letters
   become lower case and every '_' becomes '-', so that spellings such as
   "UTF_8" are recognized.

   lower_len is the capacity of lower, terminator included.  Returns 1 on
   success (lower is then null-terminated).  Returns 0 when encoding is
   longer than lower_len-1 characters; in that case lower is left without a
   terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const last = lower + (lower_len - 1);

    for (; *src != '\0'; src++, dst++) {
        if (dst == last)
            return 0;

        if (*src == '_')
            *dst = '-';
        else if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2776
Alexander Belopolsky40018472011-02-26 01:02:56 +00002777PyObject *
2778PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002779 Py_ssize_t size,
2780 const char *encoding,
2781 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002782{
2783 PyObject *buffer = NULL, *unicode;
2784 Py_buffer info;
2785 char lower[11]; /* Enough for any encoding shortcut */
2786
2787 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002788 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002789
2790 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002791 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002792 if ((strcmp(lower, "utf-8") == 0) ||
2793 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002794 return PyUnicode_DecodeUTF8(s, size, errors);
2795 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002796 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002797 (strcmp(lower, "iso-8859-1") == 0))
2798 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002799#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002800 else if (strcmp(lower, "mbcs") == 0)
2801 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002802#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002803 else if (strcmp(lower, "ascii") == 0)
2804 return PyUnicode_DecodeASCII(s, size, errors);
2805 else if (strcmp(lower, "utf-16") == 0)
2806 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2807 else if (strcmp(lower, "utf-32") == 0)
2808 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810
2811 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002812 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002813 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002814 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002815 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 if (buffer == NULL)
2817 goto onError;
2818 unicode = PyCodec_Decode(buffer, encoding, errors);
2819 if (unicode == NULL)
2820 goto onError;
2821 if (!PyUnicode_Check(unicode)) {
2822 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002823 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002824 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 Py_DECREF(unicode);
2826 goto onError;
2827 }
2828 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002829#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002830 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 Py_DECREF(unicode);
2832 return NULL;
2833 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002834#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002835 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002837
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 Py_XDECREF(buffer);
2840 return NULL;
2841}
2842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843PyObject *
2844PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002845 const char *encoding,
2846 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002847{
2848 PyObject *v;
2849
2850 if (!PyUnicode_Check(unicode)) {
2851 PyErr_BadArgument();
2852 goto onError;
2853 }
2854
2855 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002857
2858 /* Decode via the codec registry */
2859 v = PyCodec_Decode(unicode, encoding, errors);
2860 if (v == NULL)
2861 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002862 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002863 return v;
2864
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002866 return NULL;
2867}
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869PyObject *
2870PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding,
2872 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002873{
2874 PyObject *v;
2875
2876 if (!PyUnicode_Check(unicode)) {
2877 PyErr_BadArgument();
2878 goto onError;
2879 }
2880
2881 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002883
2884 /* Decode via the codec registry */
2885 v = PyCodec_Decode(unicode, encoding, errors);
2886 if (v == NULL)
2887 goto onError;
2888 if (!PyUnicode_Check(v)) {
2889 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002890 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002891 Py_TYPE(v)->tp_name);
2892 Py_DECREF(v);
2893 goto onError;
2894 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002895 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002896 return v;
2897
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002899 return NULL;
2900}
2901
Alexander Belopolsky40018472011-02-26 01:02:56 +00002902PyObject *
2903PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002904 Py_ssize_t size,
2905 const char *encoding,
2906 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907{
2908 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002909
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 unicode = PyUnicode_FromUnicode(s, size);
2911 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2914 Py_DECREF(unicode);
2915 return v;
2916}
2917
Alexander Belopolsky40018472011-02-26 01:02:56 +00002918PyObject *
2919PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002920 const char *encoding,
2921 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002922{
2923 PyObject *v;
2924
2925 if (!PyUnicode_Check(unicode)) {
2926 PyErr_BadArgument();
2927 goto onError;
2928 }
2929
2930 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002932
2933 /* Encode via the codec registry */
2934 v = PyCodec_Encode(unicode, encoding, errors);
2935 if (v == NULL)
2936 goto onError;
2937 return v;
2938
Benjamin Peterson29060642009-01-31 22:14:21 +00002939 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002940 return NULL;
2941}
2942
Victor Stinnerad158722010-10-27 00:25:46 +00002943PyObject *
2944PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002945{
Victor Stinner99b95382011-07-04 14:23:54 +02002946#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002947 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2948 PyUnicode_GET_SIZE(unicode),
2949 NULL);
2950#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002952#else
Victor Stinner793b5312011-04-27 00:24:21 +02002953 PyInterpreterState *interp = PyThreadState_GET()->interp;
2954 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2955 cannot use it to encode and decode filenames before it is loaded. Load
2956 the Python codec requires to encode at least its own filename. Use the C
2957 version of the locale codec until the codec registry is initialized and
2958 the Python codec is loaded.
2959
2960 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2961 cannot only rely on it: check also interp->fscodec_initialized for
2962 subinterpreters. */
2963 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002964 return PyUnicode_AsEncodedString(unicode,
2965 Py_FileSystemDefaultEncoding,
2966 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002967 }
2968 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002969 /* locale encoding with surrogateescape */
2970 wchar_t *wchar;
2971 char *bytes;
2972 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002973 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002974
2975 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2976 if (wchar == NULL)
2977 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002978 bytes = _Py_wchar2char(wchar, &error_pos);
2979 if (bytes == NULL) {
2980 if (error_pos != (size_t)-1) {
2981 char *errmsg = strerror(errno);
2982 PyObject *exc = NULL;
2983 if (errmsg == NULL)
2984 errmsg = "Py_wchar2char() failed";
2985 raise_encode_exception(&exc,
2986 "filesystemencoding",
2987 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2988 error_pos, error_pos+1,
2989 errmsg);
2990 Py_XDECREF(exc);
2991 }
2992 else
2993 PyErr_NoMemory();
2994 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002995 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002996 }
2997 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002998
2999 bytes_obj = PyBytes_FromString(bytes);
3000 PyMem_Free(bytes);
3001 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003002 }
Victor Stinnerad158722010-10-27 00:25:46 +00003003#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003004}
3005
Alexander Belopolsky40018472011-02-26 01:02:56 +00003006PyObject *
3007PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003008 const char *encoding,
3009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010{
3011 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003012 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003013
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (!PyUnicode_Check(unicode)) {
3015 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
Fred Drakee4315f52000-05-09 19:53:39 +00003018
Victor Stinner2f283c22011-03-02 01:21:46 +00003019 if (encoding == NULL) {
3020 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003021 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003022 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003023 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003024 }
Fred Drakee4315f52000-05-09 19:53:39 +00003025
3026 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003027 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003028 if ((strcmp(lower, "utf-8") == 0) ||
3029 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003030 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003031 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003032 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003033 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003034 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003035 }
Victor Stinner37296e82010-06-10 13:36:23 +00003036 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003037 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003038 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003039 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003040#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003041 else if (strcmp(lower, "mbcs") == 0)
3042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode),
3044 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003045#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003046 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049
3050 /* Encode via the codec registry */
3051 v = PyCodec_Encode(unicode, encoding, errors);
3052 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003053 return NULL;
3054
3055 /* The normal path */
3056 if (PyBytes_Check(v))
3057 return v;
3058
3059 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003060 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003061 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003062 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003063
3064 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3065 "encoder %s returned bytearray instead of bytes",
3066 encoding);
3067 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003068 Py_DECREF(v);
3069 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003072 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3073 Py_DECREF(v);
3074 return b;
3075 }
3076
3077 PyErr_Format(PyExc_TypeError,
3078 "encoder did not return a bytes object (type=%.400s)",
3079 Py_TYPE(v)->tp_name);
3080 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003081 return NULL;
3082}
3083
Alexander Belopolsky40018472011-02-26 01:02:56 +00003084PyObject *
3085PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003086 const char *encoding,
3087 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003088{
3089 PyObject *v;
3090
3091 if (!PyUnicode_Check(unicode)) {
3092 PyErr_BadArgument();
3093 goto onError;
3094 }
3095
3096 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003098
3099 /* Encode via the codec registry */
3100 v = PyCodec_Encode(unicode, encoding, errors);
3101 if (v == NULL)
3102 goto onError;
3103 if (!PyUnicode_Check(v)) {
3104 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003105 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_TYPE(v)->tp_name);
3107 Py_DECREF(v);
3108 goto onError;
3109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003111
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 return NULL;
3114}
3115
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003116PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003117PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003118 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003119 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3120}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003121
Christian Heimes5894ba72007-11-04 11:43:14 +00003122PyObject*
3123PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3124{
Victor Stinner99b95382011-07-04 14:23:54 +02003125#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003126 return PyUnicode_DecodeMBCS(s, size, NULL);
3127#elif defined(__APPLE__)
3128 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3129#else
Victor Stinner793b5312011-04-27 00:24:21 +02003130 PyInterpreterState *interp = PyThreadState_GET()->interp;
3131 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3132 cannot use it to encode and decode filenames before it is loaded. Load
3133 the Python codec requires to encode at least its own filename. Use the C
3134 version of the locale codec until the codec registry is initialized and
3135 the Python codec is loaded.
3136
3137 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3138 cannot only rely on it: check also interp->fscodec_initialized for
3139 subinterpreters. */
3140 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003141 return PyUnicode_Decode(s, size,
3142 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003143 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003144 }
3145 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003146 /* locale encoding with surrogateescape */
3147 wchar_t *wchar;
3148 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003149 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003150
3151 if (s[size] != '\0' || size != strlen(s)) {
3152 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3153 return NULL;
3154 }
3155
Victor Stinner168e1172010-10-16 23:16:16 +00003156 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003157 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003158 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003159
Victor Stinner168e1172010-10-16 23:16:16 +00003160 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003161 PyMem_Free(wchar);
3162 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003163 }
Victor Stinnerad158722010-10-27 00:25:46 +00003164#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003165}
3166
Martin v. Löwis011e8422009-05-05 04:43:17 +00003167
3168int
3169PyUnicode_FSConverter(PyObject* arg, void* addr)
3170{
3171 PyObject *output = NULL;
3172 Py_ssize_t size;
3173 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003174 if (arg == NULL) {
3175 Py_DECREF(*(PyObject**)addr);
3176 return 1;
3177 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003178 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003179 output = arg;
3180 Py_INCREF(output);
3181 }
3182 else {
3183 arg = PyUnicode_FromObject(arg);
3184 if (!arg)
3185 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003186 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003187 Py_DECREF(arg);
3188 if (!output)
3189 return 0;
3190 if (!PyBytes_Check(output)) {
3191 Py_DECREF(output);
3192 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3193 return 0;
3194 }
3195 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003196 size = PyBytes_GET_SIZE(output);
3197 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003198 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003199 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003200 Py_DECREF(output);
3201 return 0;
3202 }
3203 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003204 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003205}
3206
3207
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003208int
3209PyUnicode_FSDecoder(PyObject* arg, void* addr)
3210{
3211 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003212 if (arg == NULL) {
3213 Py_DECREF(*(PyObject**)addr);
3214 return 1;
3215 }
3216 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003217 if (PyUnicode_READY(arg))
3218 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003219 output = arg;
3220 Py_INCREF(output);
3221 }
3222 else {
3223 arg = PyBytes_FromObject(arg);
3224 if (!arg)
3225 return 0;
3226 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3227 PyBytes_GET_SIZE(arg));
3228 Py_DECREF(arg);
3229 if (!output)
3230 return 0;
3231 if (!PyUnicode_Check(output)) {
3232 Py_DECREF(output);
3233 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3234 return 0;
3235 }
3236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003237 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3238 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003239 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3240 Py_DECREF(output);
3241 return 0;
3242 }
3243 *(PyObject**)addr = output;
3244 return Py_CLEANUP_SUPPORTED;
3245}
3246
3247
Martin v. Löwis5b222132007-06-10 09:51:05 +00003248char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003249PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003250{
Christian Heimesf3863112007-11-22 07:46:41 +00003251 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003252 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3253
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003254 if (!PyUnicode_Check(unicode)) {
3255 PyErr_BadArgument();
3256 return NULL;
3257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003258 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003259 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003260
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003261 if (PyUnicode_UTF8(unicode) == NULL) {
3262 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003263 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3264 if (bytes == NULL)
3265 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003266 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3267 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003268 Py_DECREF(bytes);
3269 return NULL;
3270 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003271 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3272 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003273 Py_DECREF(bytes);
3274 }
3275
3276 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003277 *psize = PyUnicode_UTF8_LENGTH(unicode);
3278 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003279}
3280
3281char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003282PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003284 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3285}
3286
3287#ifdef Py_DEBUG
3288int unicode_as_unicode_calls = 0;
3289#endif
3290
3291
3292Py_UNICODE *
3293PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3294{
3295 PyUnicodeObject *u;
3296 const unsigned char *one_byte;
3297#if SIZEOF_WCHAR_T == 4
3298 const Py_UCS2 *two_bytes;
3299#else
3300 const Py_UCS4 *four_bytes;
3301 const Py_UCS4 *ucs4_end;
3302 Py_ssize_t num_surrogates;
3303#endif
3304 wchar_t *w;
3305 wchar_t *wchar_end;
3306
3307 if (!PyUnicode_Check(unicode)) {
3308 PyErr_BadArgument();
3309 return NULL;
3310 }
3311 u = (PyUnicodeObject*)unicode;
3312 if (_PyUnicode_WSTR(u) == NULL) {
3313 /* Non-ASCII compact unicode object */
3314 assert(_PyUnicode_KIND(u) != 0);
3315 assert(PyUnicode_IS_READY(u));
3316
3317#ifdef Py_DEBUG
3318 ++unicode_as_unicode_calls;
3319#endif
3320
3321 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3322#if SIZEOF_WCHAR_T == 2
3323 four_bytes = PyUnicode_4BYTE_DATA(u);
3324 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3325 num_surrogates = 0;
3326
3327 for (; four_bytes < ucs4_end; ++four_bytes) {
3328 if (*four_bytes > 0xFFFF)
3329 ++num_surrogates;
3330 }
3331
3332 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3333 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3334 if (!_PyUnicode_WSTR(u)) {
3335 PyErr_NoMemory();
3336 return NULL;
3337 }
3338 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3339
3340 w = _PyUnicode_WSTR(u);
3341 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3342 four_bytes = PyUnicode_4BYTE_DATA(u);
3343 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3344 if (*four_bytes > 0xFFFF) {
3345 /* encode surrogate pair in this case */
3346 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3347 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3348 }
3349 else
3350 *w = *four_bytes;
3351
3352 if (w > wchar_end) {
3353 assert(0 && "Miscalculated string end");
3354 }
3355 }
3356 *w = 0;
3357#else
3358 /* sizeof(wchar_t) == 4 */
3359 Py_FatalError("Impossible unicode object state, wstr and str "
3360 "should share memory already.");
3361 return NULL;
3362#endif
3363 }
3364 else {
3365 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3366 (_PyUnicode_LENGTH(u) + 1));
3367 if (!_PyUnicode_WSTR(u)) {
3368 PyErr_NoMemory();
3369 return NULL;
3370 }
3371 if (!PyUnicode_IS_COMPACT_ASCII(u))
3372 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3373 w = _PyUnicode_WSTR(u);
3374 wchar_end = w + _PyUnicode_LENGTH(u);
3375
3376 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3377 one_byte = PyUnicode_1BYTE_DATA(u);
3378 for (; w < wchar_end; ++one_byte, ++w)
3379 *w = *one_byte;
3380 /* null-terminate the wstr */
3381 *w = 0;
3382 }
3383 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3384#if SIZEOF_WCHAR_T == 4
3385 two_bytes = PyUnicode_2BYTE_DATA(u);
3386 for (; w < wchar_end; ++two_bytes, ++w)
3387 *w = *two_bytes;
3388 /* null-terminate the wstr */
3389 *w = 0;
3390#else
3391 /* sizeof(wchar_t) == 2 */
3392 PyObject_FREE(_PyUnicode_WSTR(u));
3393 _PyUnicode_WSTR(u) = NULL;
3394 Py_FatalError("Impossible unicode object state, wstr "
3395 "and str should share memory already.");
3396 return NULL;
3397#endif
3398 }
3399 else {
3400 assert(0 && "This should never happen.");
3401 }
3402 }
3403 }
3404 if (size != NULL)
3405 *size = PyUnicode_WSTR_LENGTH(u);
3406 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003407}
3408
Alexander Belopolsky40018472011-02-26 01:02:56 +00003409Py_UNICODE *
3410PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003412 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413}
3414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415
Alexander Belopolsky40018472011-02-26 01:02:56 +00003416Py_ssize_t
3417PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418{
3419 if (!PyUnicode_Check(unicode)) {
3420 PyErr_BadArgument();
3421 goto onError;
3422 }
3423 return PyUnicode_GET_SIZE(unicode);
3424
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 return -1;
3427}
3428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429Py_ssize_t
3430PyUnicode_GetLength(PyObject *unicode)
3431{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003432 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003433 PyErr_BadArgument();
3434 return -1;
3435 }
3436
3437 return PyUnicode_GET_LENGTH(unicode);
3438}
3439
3440Py_UCS4
3441PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3442{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003443 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3444 PyErr_BadArgument();
3445 return (Py_UCS4)-1;
3446 }
3447 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3448 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449 return (Py_UCS4)-1;
3450 }
3451 return PyUnicode_READ_CHAR(unicode, index);
3452}
3453
3454int
3455PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3456{
3457 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003458 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 return -1;
3460 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003461 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3462 PyErr_SetString(PyExc_IndexError, "string index out of range");
3463 return -1;
3464 }
3465 if (_PyUnicode_Dirty(unicode))
3466 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3468 index, ch);
3469 return 0;
3470}
3471
/* Return the name of the default encoding; it is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3477
Victor Stinner554f3f02010-06-16 23:33:54 +00003478/* create or adjust a UnicodeDecodeError */
3479static void
3480make_decode_exception(PyObject **exceptionObject,
3481 const char *encoding,
3482 const char *input, Py_ssize_t length,
3483 Py_ssize_t startpos, Py_ssize_t endpos,
3484 const char *reason)
3485{
3486 if (*exceptionObject == NULL) {
3487 *exceptionObject = PyUnicodeDecodeError_Create(
3488 encoding, input, length, startpos, endpos, reason);
3489 }
3490 else {
3491 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3492 goto onError;
3493 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3494 goto onError;
3495 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3496 goto onError;
3497 }
3498 return;
3499
3500onError:
3501 Py_DECREF(*exceptionObject);
3502 *exceptionObject = NULL;
3503}
3504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505/* error handling callback helper:
3506 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003507 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 and adjust various state variables.
3509 return 0 on success, -1 on error
3510*/
3511
Alexander Belopolsky40018472011-02-26 01:02:56 +00003512static int
3513unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003514 const char *encoding, const char *reason,
3515 const char **input, const char **inend, Py_ssize_t *startinpos,
3516 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3517 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003519 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520
3521 PyObject *restuple = NULL;
3522 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003523 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003524 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003525 Py_ssize_t requiredsize;
3526 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003528 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 int res = -1;
3531
3532 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 *errorHandler = PyCodec_LookupError(errors);
3534 if (*errorHandler == NULL)
3535 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 }
3537
Victor Stinner554f3f02010-06-16 23:33:54 +00003538 make_decode_exception(exceptionObject,
3539 encoding,
3540 *input, *inend - *input,
3541 *startinpos, *endinpos,
3542 reason);
3543 if (*exceptionObject == NULL)
3544 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545
3546 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3547 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003548 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003550 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 }
3553 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003555
3556 /* Copy back the bytes variables, which might have been modified by the
3557 callback */
3558 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3559 if (!inputobj)
3560 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003561 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003563 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003564 *input = PyBytes_AS_STRING(inputobj);
3565 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003566 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003567 /* we can DECREF safely, as the exception has another reference,
3568 so the object won't go away. */
3569 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003573 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003574 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3575 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577
3578 /* need more space? (at least enough for what we
3579 have+the replacement+the rest of the string (starting
3580 at the new input position), so we won't have to check space
3581 when there are no errors in the rest of the string) */
3582 repptr = PyUnicode_AS_UNICODE(repunicode);
3583 repsize = PyUnicode_GET_SIZE(repunicode);
3584 requiredsize = *outpos + repsize + insize-newpos;
3585 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003586 if (requiredsize<2*outsize)
3587 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003588 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 goto onError;
3590 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 }
3592 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003593 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 Py_UNICODE_COPY(*outptr, repptr, repsize);
3595 *outptr += repsize;
3596 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 /* we made it! */
3599 res = 0;
3600
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 Py_XDECREF(restuple);
3603 return res;
3604}
3605
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606/* --- UTF-7 Codec -------------------------------------------------------- */
3607
Antoine Pitrou244651a2009-05-04 18:56:13 +00003608/* See RFC2152 for details. We encode conservatively and decode liberally. */
3609
3610/* Three simple macros defining base-64. */
3611
3612/* Is c a base-64 character? */
3613
3614#define IS_BASE64(c) \
3615 (((c) >= 'A' && (c) <= 'Z') || \
3616 ((c) >= 'a' && (c) <= 'z') || \
3617 ((c) >= '0' && (c) <= '9') || \
3618 (c) == '+' || (c) == '/')
3619
3620/* given that c is a base-64 character, what is its base-64 value? */
3621
3622#define FROM_BASE64(c) \
3623 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
3624 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
3625 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
3626 (c) == '+' ? 62 : 63)
3627
3628/* What is the base-64 character of the bottom 6 bits of n? */
3629
3630#define TO_BASE64(n) \
3631 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3632
3633/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
3634 * decoded as itself. We are permissive on decoding; the only ASCII
3635 * byte not decoding to itself is the + which begins a base64
3636 * string. */
3637
3638#define DECODE_DIRECT(c) \
3639 ((c) <= 127 && (c) != '+')
3640
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3674
/* ENCODE_DIRECT: non-zero when character c may be written to the
 * UTF-7 output as itself.  Only values 1..127 are candidates.  Set D
 * characters (category 0) always qualify; Set O characters
 * (category 1) qualify when directO is true and whitespace
 * (category 2) when directWS is true.  RFC2152 leaves both choices to
 * the application, hence the two flags.  The range test comes first so
 * that utf7_category is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) > 0 && (c) < 128 &&                                \
     (utf7_category[(c)] == 0 ||                            \
      (utf7_category[(c)] == 2 && (directWS)) ||            \
      (utf7_category[(c)] == 1 && (directO))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003686
Alexander Belopolsky40018472011-02-26 01:02:56 +00003687PyObject *
3688PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003689 Py_ssize_t size,
3690 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003692 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3693}
3694
Antoine Pitrou244651a2009-05-04 18:56:13 +00003695/* The decoder. The only state we preserve is our read position,
3696 * i.e. how many characters we have consumed. So if we end in the
3697 * middle of a shift sequence we have to back off the read position
3698 * and the output to the beginning of the sequence, otherwise we lose
3699 * all the shift state (seen bits, number of bits seen, high
3700 * surrogate). */
3701
Alexander Belopolsky40018472011-02-26 01:02:56 +00003702PyObject *
3703PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003704 Py_ssize_t size,
3705 const char *errors,
3706 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003709 Py_ssize_t startinpos;
3710 Py_ssize_t endinpos;
3711 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003712 const char *e;
3713 PyUnicodeObject *unicode;
3714 Py_UNICODE *p;
3715 const char *errmsg = "";
3716 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003717 Py_UNICODE *shiftOutStart;
3718 unsigned int base64bits = 0;
3719 unsigned long base64buffer = 0;
3720 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 PyObject *errorHandler = NULL;
3722 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003723
3724 unicode = _PyUnicode_New(size);
3725 if (!unicode)
3726 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003727 if (size == 0) {
3728 if (consumed)
3729 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003730 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003734 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003735 e = s + size;
3736
3737 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003738 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003739 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003740 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003741
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 if (inShift) { /* in a base-64 section */
3743 if (IS_BASE64(ch)) { /* consume a base-64 character */
3744 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3745 base64bits += 6;
3746 s++;
3747 if (base64bits >= 16) {
3748 /* we have enough bits for a UTF-16 value */
3749 Py_UNICODE outCh = (Py_UNICODE)
3750 (base64buffer >> (base64bits-16));
3751 base64bits -= 16;
3752 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3753 if (surrogate) {
3754 /* expecting a second surrogate */
3755 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3756#ifdef Py_UNICODE_WIDE
3757 *p++ = (((surrogate & 0x3FF)<<10)
3758 | (outCh & 0x3FF)) + 0x10000;
3759#else
3760 *p++ = surrogate;
3761 *p++ = outCh;
3762#endif
3763 surrogate = 0;
3764 }
3765 else {
3766 surrogate = 0;
3767 errmsg = "second surrogate missing";
3768 goto utf7Error;
3769 }
3770 }
3771 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3772 /* first surrogate */
3773 surrogate = outCh;
3774 }
3775 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3776 errmsg = "unexpected second surrogate";
3777 goto utf7Error;
3778 }
3779 else {
3780 *p++ = outCh;
3781 }
3782 }
3783 }
3784 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003785 inShift = 0;
3786 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003787 if (surrogate) {
3788 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003789 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003791 if (base64bits > 0) { /* left-over bits */
3792 if (base64bits >= 6) {
3793 /* We've seen at least one base-64 character */
3794 errmsg = "partial character in shift sequence";
3795 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003797 else {
3798 /* Some bits remain; they should be zero */
3799 if (base64buffer != 0) {
3800 errmsg = "non-zero padding bits in shift sequence";
3801 goto utf7Error;
3802 }
3803 }
3804 }
3805 if (ch != '-') {
3806 /* '-' is absorbed; other terminating
3807 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808 *p++ = ch;
3809 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003810 }
3811 }
3812 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003814 s++; /* consume '+' */
3815 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003816 s++;
3817 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 }
3819 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003821 shiftOutStart = p;
3822 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003823 }
3824 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003825 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003826 *p++ = ch;
3827 s++;
3828 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003829 else {
3830 startinpos = s-starts;
3831 s++;
3832 errmsg = "unexpected special character";
3833 goto utf7Error;
3834 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003835 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003836utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837 outpos = p-PyUnicode_AS_UNICODE(unicode);
3838 endinpos = s-starts;
3839 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 errors, &errorHandler,
3841 "utf7", errmsg,
3842 &starts, &e, &startinpos, &endinpos, &exc, &s,
3843 &unicode, &outpos, &p))
3844 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003845 }
3846
Antoine Pitrou244651a2009-05-04 18:56:13 +00003847 /* end of string */
3848
3849 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3850 /* if we're in an inconsistent state, that's an error */
3851 if (surrogate ||
3852 (base64bits >= 6) ||
3853 (base64bits > 0 && base64buffer != 0)) {
3854 outpos = p-PyUnicode_AS_UNICODE(unicode);
3855 endinpos = size;
3856 if (unicode_decode_call_errorhandler(
3857 errors, &errorHandler,
3858 "utf7", "unterminated shift sequence",
3859 &starts, &e, &startinpos, &endinpos, &exc, &s,
3860 &unicode, &outpos, &p))
3861 goto onError;
3862 if (s < e)
3863 goto restart;
3864 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003865 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003866
3867 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003868 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869 if (inShift) {
3870 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003871 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003872 }
3873 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003874 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003876 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877
Victor Stinnerfe226c02011-10-03 03:52:20 +02003878 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003879 goto onError;
3880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 Py_XDECREF(errorHandler);
3882 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003883#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003884 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 Py_DECREF(unicode);
3886 return NULL;
3887 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003888#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003889 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 return (PyObject *)unicode;
3891
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 Py_XDECREF(errorHandler);
3894 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 Py_DECREF(unicode);
3896 return NULL;
3897}
3898
3899
Alexander Belopolsky40018472011-02-26 01:02:56 +00003900PyObject *
3901PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003902 Py_ssize_t size,
3903 int base64SetO,
3904 int base64WhiteSpace,
3905 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003906{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003907 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003909 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003910 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003911 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 unsigned int base64bits = 0;
3913 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003914 char * out;
3915 char * start;
3916
3917 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003919
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003920 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003921 return PyErr_NoMemory();
3922
Antoine Pitrou244651a2009-05-04 18:56:13 +00003923 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003924 if (v == NULL)
3925 return NULL;
3926
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003927 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003928 for (;i < size; ++i) {
3929 Py_UNICODE ch = s[i];
3930
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 if (inShift) {
3932 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3933 /* shifting out */
3934 if (base64bits) { /* output remaining bits */
3935 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3936 base64buffer = 0;
3937 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 }
3939 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003940 /* Characters not in the BASE64 set implicitly unshift the sequence
3941 so no '-' is required, except if the character is itself a '-' */
3942 if (IS_BASE64(ch) || ch == '-') {
3943 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 *out++ = (char) ch;
3946 }
3947 else {
3948 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003949 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 else { /* not in a shift sequence */
3952 if (ch == '+') {
3953 *out++ = '+';
3954 *out++ = '-';
3955 }
3956 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3957 *out++ = (char) ch;
3958 }
3959 else {
3960 *out++ = '+';
3961 inShift = 1;
3962 goto encode_char;
3963 }
3964 }
3965 continue;
3966encode_char:
3967#ifdef Py_UNICODE_WIDE
3968 if (ch >= 0x10000) {
3969 /* code first surrogate */
3970 base64bits += 16;
3971 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3972 while (base64bits >= 6) {
3973 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3974 base64bits -= 6;
3975 }
3976 /* prepare second surrogate */
3977 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3978 }
3979#endif
3980 base64bits += 16;
3981 base64buffer = (base64buffer << 16) | ch;
3982 while (base64bits >= 6) {
3983 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3984 base64bits -= 6;
3985 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 if (base64bits)
3988 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3989 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003991 if (_PyBytes_Resize(&v, out - start) < 0)
3992 return NULL;
3993 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994}
3995
/* The UTF-7 helper macros are private to the codec above. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004001
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002/* --- UTF-8 Codec -------------------------------------------------------- */
4003
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4025
Alexander Belopolsky40018472011-02-26 01:02:56 +00004026PyObject *
4027PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004028 Py_ssize_t size,
4029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030{
Walter Dörwald69652032004-09-07 20:24:22 +00004031 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4032}
4033
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it stays a single
   operand in whatever expression it is used. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4036
4037/* Mask to quickly check whether a C 'long' contains a
4038 non-ASCII, UTF8-encoded char. */
4039#if (SIZEOF_LONG == 8)
4040# define ASCII_CHAR_MASK 0x8080808080808080L
4041#elif (SIZEOF_LONG == 4)
4042# define ASCII_CHAR_MASK 0x80808080L
4043#else
4044# error C 'long' size should be either 4 or 8!
4045#endif
4046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047/* Scans a UTF-8 string and returns the maximum character to be expected,
4048 the size of the decoded unicode string and if any major errors were
4049 encountered.
4050
4051 This function does check basic UTF-8 sanity, it does however NOT CHECK
4052 if the string contains surrogates, and if all continuation bytes are
4053 within the correct ranges, these checks are performed in
4054 PyUnicode_DecodeUTF8Stateful.
4055
4056 If it sets has_errors to 1, it means the value of unicode_size and max_char
4057 will be bogus and you should not rely on useful information in them.
4058 */
4059static Py_UCS4
4060utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4061 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4062 int *has_errors)
4063{
4064 Py_ssize_t n;
4065 Py_ssize_t char_count = 0;
4066 Py_UCS4 max_char = 127, new_max;
4067 Py_UCS4 upper_bound;
4068 const unsigned char *p = (const unsigned char *)s;
4069 const unsigned char *end = p + string_size;
4070 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4071 int err = 0;
4072
4073 for (; p < end && !err; ++p, ++char_count) {
4074 /* Only check value if it's not a ASCII char... */
4075 if (*p < 0x80) {
4076 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4077 an explanation. */
4078 if (!((size_t) p & LONG_PTR_MASK)) {
4079 /* Help register allocation */
4080 register const unsigned char *_p = p;
4081 while (_p < aligned_end) {
4082 unsigned long value = *(unsigned long *) _p;
4083 if (value & ASCII_CHAR_MASK)
4084 break;
4085 _p += SIZEOF_LONG;
4086 char_count += SIZEOF_LONG;
4087 }
4088 p = _p;
4089 if (p == end)
4090 break;
4091 }
4092 }
4093 if (*p >= 0x80) {
4094 n = utf8_code_length[*p];
4095 new_max = max_char;
4096 switch (n) {
4097 /* invalid start byte */
4098 case 0:
4099 err = 1;
4100 break;
4101 case 2:
4102 /* Code points between 0x00FF and 0x07FF inclusive.
4103 Approximate the upper bound of the code point,
4104 if this flips over 255 we can be sure it will be more
4105 than 255 and the string will need 2 bytes per code coint,
4106 if it stays under or equal to 255, we can be sure 1 byte
4107 is enough.
4108 ((*p & 0b00011111) << 6) | 0b00111111 */
4109 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4110 if (max_char < upper_bound)
4111 new_max = upper_bound;
4112 /* Ensure we track at least that we left ASCII space. */
4113 if (new_max < 128)
4114 new_max = 128;
4115 break;
4116 case 3:
4117 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4118 always > 255 and <= 65535 and will always need 2 bytes. */
4119 if (max_char < 65535)
4120 new_max = 65535;
4121 break;
4122 case 4:
4123 /* Code point will be above 0xFFFF for sure in this case. */
4124 new_max = 65537;
4125 break;
4126 /* Internal error, this should be caught by the first if */
4127 case 1:
4128 default:
4129 assert(0 && "Impossible case in utf8_max_char_and_size");
4130 err = 1;
4131 }
4132 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004133 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 --n;
4135 /* Check if the follow up chars are all valid continuation bytes */
4136 if (n >= 1) {
4137 const unsigned char *cont;
4138 if ((p + n) >= end) {
4139 if (consumed == 0)
4140 /* incomplete data, non-incremental decoding */
4141 err = 1;
4142 break;
4143 }
4144 for (cont = p + 1; cont < (p + n); ++cont) {
4145 if ((*cont & 0xc0) != 0x80) {
4146 err = 1;
4147 break;
4148 }
4149 }
4150 p += n;
4151 }
4152 else
4153 err = 1;
4154 max_char = new_max;
4155 }
4156 }
4157
4158 if (unicode_size)
4159 *unicode_size = char_count;
4160 if (has_errors)
4161 *has_errors = err;
4162 return max_char;
4163}
4164
/* Store `value` at position `index` of the character buffer `buf`,
   casting both the buffer and the value to the element type selected
   by `kind`: Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the
   legacy representation), unsigned char for PyUnicode_1BYTE_KIND,
   Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.
   Like PyUnicode_WRITE, with the extra wstr case. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        switch (k_) {                                                   \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
4179
Alexander Belopolsky40018472011-02-26 01:02:56 +00004180PyObject *
4181PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 Py_ssize_t size,
4183 const char *errors,
4184 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004188 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004189 Py_ssize_t startinpos;
4190 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004191 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004193 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 PyObject *errorHandler = NULL;
4195 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 Py_UCS4 maxchar = 0;
4197 Py_ssize_t unicode_size;
4198 Py_ssize_t i;
4199 int kind;
4200 void *data;
4201 int has_errors;
4202 Py_UNICODE *error_outptr;
4203#if SIZEOF_WCHAR_T == 2
4204 Py_ssize_t wchar_offset = 0;
4205#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206
Walter Dörwald69652032004-09-07 20:24:22 +00004207 if (size == 0) {
4208 if (consumed)
4209 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004212 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4213 consumed, &has_errors);
4214 if (has_errors) {
4215 unicode = _PyUnicode_New(size);
4216 if (!unicode)
4217 return NULL;
4218 kind = PyUnicode_WCHAR_KIND;
4219 data = PyUnicode_AS_UNICODE(unicode);
4220 assert(data != NULL);
4221 }
4222 else {
4223 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4224 if (!unicode)
4225 return NULL;
4226 /* When the string is ASCII only, just use memcpy and return.
4227 unicode_size may be != size if there is an incomplete UTF-8
4228 sequence at the end of the ASCII block. */
4229 if (maxchar < 128 && size == unicode_size) {
4230 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4231 return (PyObject *)unicode;
4232 }
4233 kind = PyUnicode_KIND(unicode);
4234 data = PyUnicode_DATA(unicode);
4235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004239 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240
4241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243
4244 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004245 /* Fast path for runs of ASCII characters. Given that common UTF-8
4246 input will consist of an overwhelming majority of ASCII
4247 characters, we try to optimize for this case by checking
4248 as many characters as a C 'long' can contain.
4249 First, check if we can do an aligned read, as most CPUs have
4250 a penalty for unaligned reads.
4251 */
4252 if (!((size_t) s & LONG_PTR_MASK)) {
4253 /* Help register allocation */
4254 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004256 while (_s < aligned_end) {
4257 /* Read a whole long at a time (either 4 or 8 bytes),
4258 and do a fast unrolled copy if it only contains ASCII
4259 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 unsigned long value = *(unsigned long *) _s;
4261 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004262 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4264 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4265 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4266 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004267#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4269 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4270 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4271 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004272#endif
4273 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004275 }
4276 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004278 if (s == e)
4279 break;
4280 ch = (unsigned char)*s;
4281 }
4282 }
4283
4284 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 s++;
4287 continue;
4288 }
4289
4290 n = utf8_code_length[ch];
4291
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004292 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 if (consumed)
4294 break;
4295 else {
4296 errmsg = "unexpected end of data";
4297 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004298 endinpos = startinpos+1;
4299 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4300 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 goto utf8Error;
4302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304
4305 switch (n) {
4306
4307 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004308 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 startinpos = s-starts;
4310 endinpos = startinpos+1;
4311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312
4313 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004314 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 startinpos = s-starts;
4316 endinpos = startinpos+1;
4317 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318
4319 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004320 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004321 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004323 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 goto utf8Error;
4325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004327 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 break;
4330
4331 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004332 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4333 will result in surrogates in range d800-dfff. Surrogates are
4334 not valid UTF-8 so they are rejected.
4335 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4336 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004337 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004338 (s[2] & 0xc0) != 0x80 ||
4339 ((unsigned char)s[0] == 0xE0 &&
4340 (unsigned char)s[1] < 0xA0) ||
4341 ((unsigned char)s[0] == 0xED &&
4342 (unsigned char)s[1] > 0x9F)) {
4343 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004345 endinpos = startinpos + 1;
4346
4347 /* if s[1] first two bits are 1 and 0, then the invalid
4348 continuation byte is s[2], so increment endinpos by 1,
4349 if not, s[1] is invalid and endinpos doesn't need to
4350 be incremented. */
4351 if ((s[1] & 0xC0) == 0x80)
4352 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 goto utf8Error;
4354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004356 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004358 break;
4359
4360 case 4:
4361 if ((s[1] & 0xc0) != 0x80 ||
4362 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004363 (s[3] & 0xc0) != 0x80 ||
4364 ((unsigned char)s[0] == 0xF0 &&
4365 (unsigned char)s[1] < 0x90) ||
4366 ((unsigned char)s[0] == 0xF4 &&
4367 (unsigned char)s[1] > 0x8F)) {
4368 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004370 endinpos = startinpos + 1;
4371 if ((s[1] & 0xC0) == 0x80) {
4372 endinpos++;
4373 if ((s[2] & 0xC0) == 0x80)
4374 endinpos++;
4375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 goto utf8Error;
4377 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004378 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004379 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4380 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 /* If the string is flexible or we have native UCS-4, write
4383 directly.. */
4384 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4385 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004387 else {
4388 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004390 /* translate from 10000..10FFFF to 0..FFFF */
4391 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 /* high surrogate = top 10 bits added to D800 */
4394 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4395 (Py_UNICODE)(0xD800 + (ch >> 10)));
4396
4397 /* low surrogate = bottom 10 bits added to DC00 */
4398 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4399 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4400 }
4401#if SIZEOF_WCHAR_T == 2
4402 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004403#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 }
4406 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004408
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004410 /* If this is not yet a resizable string, make it one.. */
4411 if (kind != PyUnicode_WCHAR_KIND) {
4412 const Py_UNICODE *u;
4413 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4414 if (!new_unicode)
4415 goto onError;
4416 u = PyUnicode_AsUnicode((PyObject *)unicode);
4417 if (!u)
4418 goto onError;
4419#if SIZEOF_WCHAR_T == 2
4420 i += wchar_offset;
4421#endif
4422 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4423 Py_DECREF(unicode);
4424 unicode = new_unicode;
4425 kind = 0;
4426 data = PyUnicode_AS_UNICODE(new_unicode);
4427 assert(data != NULL);
4428 }
4429 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 if (unicode_decode_call_errorhandler(
4431 errors, &errorHandler,
4432 "utf8", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004434 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004436 /* Update data because unicode_decode_call_errorhandler might have
4437 re-created or resized the unicode object. */
4438 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 /* Ensure the unicode_size calculation above was correct: */
4442 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4443
Walter Dörwald69652032004-09-07 20:24:22 +00004444 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447 /* Adjust length and ready string when it contained errors and
4448 is of the old resizable kind. */
4449 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004450 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004451 goto onError;
4452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 Py_XDECREF(errorHandler);
4455 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004456#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004457 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004458 Py_DECREF(unicode);
4459 return NULL;
4460 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004461#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004462 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 return (PyObject *)unicode;
4464
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 Py_XDECREF(errorHandler);
4467 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 Py_DECREF(unicode);
4469 return NULL;
4470}
4471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004473
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004474#ifdef __APPLE__
4475
4476/* Simplified UTF-8 decoder using surrogateescape error handler,
4477 used to decode the command line arguments on Mac OS X. */
4478
4479wchar_t*
4480_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4481{
4482 int n;
4483 const char *e;
4484 wchar_t *unicode, *p;
4485
4486 /* Note: size will always be longer than the resulting Unicode
4487 character count */
4488 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4489 PyErr_NoMemory();
4490 return NULL;
4491 }
4492 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4493 if (!unicode)
4494 return NULL;
4495
4496 /* Unpack UTF-8 encoded data */
4497 p = unicode;
4498 e = s + size;
4499 while (s < e) {
4500 Py_UCS4 ch = (unsigned char)*s;
4501
4502 if (ch < 0x80) {
4503 *p++ = (wchar_t)ch;
4504 s++;
4505 continue;
4506 }
4507
4508 n = utf8_code_length[ch];
4509 if (s + n > e) {
4510 goto surrogateescape;
4511 }
4512
4513 switch (n) {
4514 case 0:
4515 case 1:
4516 goto surrogateescape;
4517
4518 case 2:
4519 if ((s[1] & 0xc0) != 0x80)
4520 goto surrogateescape;
4521 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4522 assert ((ch > 0x007F) && (ch <= 0x07FF));
4523 *p++ = (wchar_t)ch;
4524 break;
4525
4526 case 3:
4527 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4528 will result in surrogates in range d800-dfff. Surrogates are
4529 not valid UTF-8 so they are rejected.
4530 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4531 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4532 if ((s[1] & 0xc0) != 0x80 ||
4533 (s[2] & 0xc0) != 0x80 ||
4534 ((unsigned char)s[0] == 0xE0 &&
4535 (unsigned char)s[1] < 0xA0) ||
4536 ((unsigned char)s[0] == 0xED &&
4537 (unsigned char)s[1] > 0x9F)) {
4538
4539 goto surrogateescape;
4540 }
4541 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4542 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004544 break;
4545
4546 case 4:
4547 if ((s[1] & 0xc0) != 0x80 ||
4548 (s[2] & 0xc0) != 0x80 ||
4549 (s[3] & 0xc0) != 0x80 ||
4550 ((unsigned char)s[0] == 0xF0 &&
4551 (unsigned char)s[1] < 0x90) ||
4552 ((unsigned char)s[0] == 0xF4 &&
4553 (unsigned char)s[1] > 0x8F)) {
4554 goto surrogateescape;
4555 }
4556 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4557 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4558 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4559
4560#if SIZEOF_WCHAR_T == 4
4561 *p++ = (wchar_t)ch;
4562#else
4563 /* compute and append the two surrogates: */
4564
4565 /* translate from 10000..10FFFF to 0..FFFF */
4566 ch -= 0x10000;
4567
4568 /* high surrogate = top 10 bits added to D800 */
4569 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4570
4571 /* low surrogate = bottom 10 bits added to DC00 */
4572 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4573#endif
4574 break;
4575 }
4576 s += n;
4577 continue;
4578
4579 surrogateescape:
4580 *p++ = 0xDC00 + ch;
4581 s++;
4582 }
4583 *p = L'\0';
4584 return unicode;
4585}
4586
4587#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004589/* Primary internal function which creates utf8 encoded bytes objects.
4590
4591 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004592 and allocate exactly as much space needed at the end. Else allocate the
4593 maximum possible needed (4 result bytes per Unicode character), and return
4594 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004595*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004596PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004597_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598{
Tim Peters602f7402002-04-27 18:03:26 +00004599#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004600
Guido van Rossum98297ee2007-11-06 21:34:58 +00004601 Py_ssize_t i; /* index into s of next input byte */
4602 PyObject *result; /* result string object */
4603 char *p; /* next free byte in output buffer */
4604 Py_ssize_t nallocated; /* number of result bytes allocated */
4605 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004606 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004607 PyObject *errorHandler = NULL;
4608 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004609 int kind;
4610 void *data;
4611 Py_ssize_t size;
4612 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4613#if SIZEOF_WCHAR_T == 2
4614 Py_ssize_t wchar_offset = 0;
4615#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004617 if (!PyUnicode_Check(unicode)) {
4618 PyErr_BadArgument();
4619 return NULL;
4620 }
4621
4622 if (PyUnicode_READY(unicode) == -1)
4623 return NULL;
4624
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004625 if (PyUnicode_UTF8(unicode))
4626 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4627 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628
4629 kind = PyUnicode_KIND(unicode);
4630 data = PyUnicode_DATA(unicode);
4631 size = PyUnicode_GET_LENGTH(unicode);
4632
Tim Peters602f7402002-04-27 18:03:26 +00004633 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634
Tim Peters602f7402002-04-27 18:03:26 +00004635 if (size <= MAX_SHORT_UNICHARS) {
4636 /* Write into the stack buffer; nallocated can't overflow.
4637 * At the end, we'll allocate exactly as much heap space as it
4638 * turns out we need.
4639 */
4640 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004641 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004642 p = stackbuf;
4643 }
4644 else {
4645 /* Overallocate on the heap, and give the excess back at the end. */
4646 nallocated = size * 4;
4647 if (nallocated / 4 != size) /* overflow! */
4648 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004649 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004650 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004651 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004652 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004653 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004654
Tim Peters602f7402002-04-27 18:03:26 +00004655 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004657
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004658 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004659 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004661
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004663 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004664 *p++ = (char)(0xc0 | (ch >> 6));
4665 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004666 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004667 Py_ssize_t newpos;
4668 PyObject *rep;
4669 Py_ssize_t repsize, k, startpos;
4670 startpos = i-1;
4671#if SIZEOF_WCHAR_T == 2
4672 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004673#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674 rep = unicode_encode_call_errorhandler(
4675 errors, &errorHandler, "utf-8", "surrogates not allowed",
4676 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4677 &exc, startpos, startpos+1, &newpos);
4678 if (!rep)
4679 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 if (PyBytes_Check(rep))
4682 repsize = PyBytes_GET_SIZE(rep);
4683 else
4684 repsize = PyUnicode_GET_SIZE(rep);
4685
4686 if (repsize > 4) {
4687 Py_ssize_t offset;
4688
4689 if (result == NULL)
4690 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004691 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004694 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4695 /* integer overflow */
4696 PyErr_NoMemory();
4697 goto error;
4698 }
4699 nallocated += repsize - 4;
4700 if (result != NULL) {
4701 if (_PyBytes_Resize(&result, nallocated) < 0)
4702 goto error;
4703 } else {
4704 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004705 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004706 goto error;
4707 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4708 }
4709 p = PyBytes_AS_STRING(result) + offset;
4710 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712 if (PyBytes_Check(rep)) {
4713 char *prep = PyBytes_AS_STRING(rep);
4714 for(k = repsize; k > 0; k--)
4715 *p++ = *prep++;
4716 } else /* rep is unicode */ {
4717 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4718 Py_UNICODE c;
4719
4720 for(k=0; k<repsize; k++) {
4721 c = prep[k];
4722 if (0x80 <= c) {
4723 raise_encode_exception(&exc, "utf-8",
4724 PyUnicode_AS_UNICODE(unicode),
4725 size, i-1, i,
4726 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004727 goto error;
4728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004730 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004733 } else if (ch < 0x10000) {
4734 *p++ = (char)(0xe0 | (ch >> 12));
4735 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4736 *p++ = (char)(0x80 | (ch & 0x3f));
4737 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004738 /* Encode UCS4 Unicode ordinals */
4739 *p++ = (char)(0xf0 | (ch >> 18));
4740 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4741 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4742 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004743#if SIZEOF_WCHAR_T == 2
4744 wchar_offset++;
4745#endif
Tim Peters602f7402002-04-27 18:03:26 +00004746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004748
Guido van Rossum98297ee2007-11-06 21:34:58 +00004749 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004750 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004751 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004752 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004753 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004754 }
4755 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004756 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004757 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004758 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004759 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004762 Py_XDECREF(errorHandler);
4763 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004764 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004765 error:
4766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
4768 Py_XDECREF(result);
4769 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004770
Tim Peters602f7402002-04-27 18:03:26 +00004771#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772}
4773
Alexander Belopolsky40018472011-02-26 01:02:56 +00004774PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4776 Py_ssize_t size,
4777 const char *errors)
4778{
4779 PyObject *v, *unicode;
4780
4781 unicode = PyUnicode_FromUnicode(s, size);
4782 if (unicode == NULL)
4783 return NULL;
4784 v = _PyUnicode_AsUTF8String(unicode, errors);
4785 Py_DECREF(unicode);
4786 return v;
4787}
4788
4789PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004790PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
Walter Dörwald41980ca2007-08-16 21:55:45 +00004795/* --- UTF-32 Codec ------------------------------------------------------- */
4796
4797PyObject *
4798PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 Py_ssize_t size,
4800 const char *errors,
4801 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802{
4803 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4804}
4805
4806PyObject *
4807PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 Py_ssize_t size,
4809 const char *errors,
4810 int *byteorder,
4811 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812{
4813 const char *starts = s;
4814 Py_ssize_t startinpos;
4815 Py_ssize_t endinpos;
4816 Py_ssize_t outpos;
4817 PyUnicodeObject *unicode;
4818 Py_UNICODE *p;
4819#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004820 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004821 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822#else
4823 const int pairs = 0;
4824#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004825 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004826 int bo = 0; /* assume native ordering by default */
4827 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828 /* Offsets from q for retrieving bytes in the right order. */
4829#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4830 int iorder[] = {0, 1, 2, 3};
4831#else
4832 int iorder[] = {3, 2, 1, 0};
4833#endif
4834 PyObject *errorHandler = NULL;
4835 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004836
Walter Dörwald41980ca2007-08-16 21:55:45 +00004837 q = (unsigned char *)s;
4838 e = q + size;
4839
4840 if (byteorder)
4841 bo = *byteorder;
4842
4843 /* Check for BOM marks (U+FEFF) in the input and adjust current
4844 byte order setting accordingly. In native mode, the leading BOM
4845 mark is skipped, in all other modes, it is copied to the output
4846 stream as-is (giving a ZWNBSP character). */
4847 if (bo == 0) {
4848 if (size >= 4) {
4849 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004851#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 if (bom == 0x0000FEFF) {
4853 q += 4;
4854 bo = -1;
4855 }
4856 else if (bom == 0xFFFE0000) {
4857 q += 4;
4858 bo = 1;
4859 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 if (bom == 0x0000FEFF) {
4862 q += 4;
4863 bo = 1;
4864 }
4865 else if (bom == 0xFFFE0000) {
4866 q += 4;
4867 bo = -1;
4868 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004869#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004871 }
4872
4873 if (bo == -1) {
4874 /* force LE */
4875 iorder[0] = 0;
4876 iorder[1] = 1;
4877 iorder[2] = 2;
4878 iorder[3] = 3;
4879 }
4880 else if (bo == 1) {
4881 /* force BE */
4882 iorder[0] = 3;
4883 iorder[1] = 2;
4884 iorder[2] = 1;
4885 iorder[3] = 0;
4886 }
4887
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004888 /* On narrow builds we split characters outside the BMP into two
4889 codepoints => count how much extra space we need. */
4890#ifndef Py_UNICODE_WIDE
4891 for (qq = q; qq < e; qq += 4)
4892 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4893 pairs++;
4894#endif
4895
4896 /* This might be one to much, because of a BOM */
4897 unicode = _PyUnicode_New((size+3)/4+pairs);
4898 if (!unicode)
4899 return NULL;
4900 if (size == 0)
4901 return (PyObject *)unicode;
4902
4903 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004905
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 Py_UCS4 ch;
4908 /* remaining bytes at the end? (size should be divisible by 4) */
4909 if (e-q<4) {
4910 if (consumed)
4911 break;
4912 errmsg = "truncated data";
4913 startinpos = ((const char *)q)-starts;
4914 endinpos = ((const char *)e)-starts;
4915 goto utf32Error;
4916 /* The remaining input chars are ignored if the callback
4917 chooses to skip the input */
4918 }
4919 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4920 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 if (ch >= 0x110000)
4923 {
4924 errmsg = "codepoint not in range(0x110000)";
4925 startinpos = ((const char *)q)-starts;
4926 endinpos = startinpos+4;
4927 goto utf32Error;
4928 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 if (ch >= 0x10000)
4931 {
4932 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4933 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4934 }
4935 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 *p++ = ch;
4938 q += 4;
4939 continue;
4940 utf32Error:
4941 outpos = p-PyUnicode_AS_UNICODE(unicode);
4942 if (unicode_decode_call_errorhandler(
4943 errors, &errorHandler,
4944 "utf32", errmsg,
4945 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4946 &unicode, &outpos, &p))
4947 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 }
4949
4950 if (byteorder)
4951 *byteorder = bo;
4952
4953 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955
4956 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004957 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 goto onError;
4959
4960 Py_XDECREF(errorHandler);
4961 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004962#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004963 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004964 Py_DECREF(unicode);
4965 return NULL;
4966 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004967#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004968 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 return (PyObject *)unicode;
4970
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 Py_DECREF(unicode);
4973 Py_XDECREF(errorHandler);
4974 Py_XDECREF(exc);
4975 return NULL;
4976}
4977
4978PyObject *
4979PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_ssize_t size,
4981 const char *errors,
4982 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004984 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004986 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004988 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989#else
4990 const int pairs = 0;
4991#endif
4992 /* Offsets from p for storing byte pairs in the right order. */
4993#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4994 int iorder[] = {0, 1, 2, 3};
4995#else
4996 int iorder[] = {3, 2, 1, 0};
4997#endif
4998
Benjamin Peterson29060642009-01-31 22:14:21 +00004999#define STORECHAR(CH) \
5000 do { \
5001 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5002 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5003 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5004 p[iorder[0]] = (CH) & 0xff; \
5005 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006 } while(0)
5007
5008 /* In narrow builds we can output surrogate pairs as one codepoint,
5009 so we need less space. */
5010#ifndef Py_UNICODE_WIDE
5011 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5013 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5014 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005016 nsize = (size - pairs + (byteorder == 0));
5017 bytesize = nsize * 4;
5018 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005020 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021 if (v == NULL)
5022 return NULL;
5023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005024 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005028 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005029
5030 if (byteorder == -1) {
5031 /* force LE */
5032 iorder[0] = 0;
5033 iorder[1] = 1;
5034 iorder[2] = 2;
5035 iorder[3] = 3;
5036 }
5037 else if (byteorder == 1) {
5038 /* force BE */
5039 iorder[0] = 3;
5040 iorder[1] = 2;
5041 iorder[2] = 1;
5042 iorder[3] = 0;
5043 }
5044
5045 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5049 Py_UCS4 ch2 = *s;
5050 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5051 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5052 s++;
5053 size--;
5054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005055 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056#endif
5057 STORECHAR(ch);
5058 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005059
5060 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005061 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062#undef STORECHAR
5063}
5064
Alexander Belopolsky40018472011-02-26 01:02:56 +00005065PyObject *
5066PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067{
5068 if (!PyUnicode_Check(unicode)) {
5069 PyErr_BadArgument();
5070 return NULL;
5071 }
5072 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 PyUnicode_GET_SIZE(unicode),
5074 NULL,
5075 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076}
5077
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078/* --- UTF-16 Codec ------------------------------------------------------- */
5079
Tim Peters772747b2001-08-09 22:21:55 +00005080PyObject *
5081PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 Py_ssize_t size,
5083 const char *errors,
5084 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085{
Walter Dörwald69652032004-09-07 20:24:22 +00005086 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5087}
5088
Antoine Pitrouab868312009-01-10 15:40:25 +00005089/* Two masks for fast checking of whether a C 'long' may contain
5090 UTF16-encoded surrogate characters. This is an efficient heuristic,
5091 assuming that non-surrogate characters with a code point >= 0x8000 are
5092 rare in most input.
5093 FAST_CHAR_MASK is used when the input is in native byte ordering,
5094 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005095*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005096#if (SIZEOF_LONG == 8)
5097# define FAST_CHAR_MASK 0x8000800080008000L
5098# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5099#elif (SIZEOF_LONG == 4)
5100# define FAST_CHAR_MASK 0x80008000L
5101# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5102#else
5103# error C 'long' size should be either 4 or 8!
5104#endif
5105
Walter Dörwald69652032004-09-07 20:24:22 +00005106PyObject *
5107PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 Py_ssize_t size,
5109 const char *errors,
5110 int *byteorder,
5111 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005112{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005114 Py_ssize_t startinpos;
5115 Py_ssize_t endinpos;
5116 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 PyUnicodeObject *unicode;
5118 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005119 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005120 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005121 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005122 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005123 /* Offsets from q for retrieving byte pairs in the right order. */
5124#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5125 int ihi = 1, ilo = 0;
5126#else
5127 int ihi = 0, ilo = 1;
5128#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 PyObject *errorHandler = NULL;
5130 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131
5132 /* Note: size will always be longer than the resulting Unicode
5133 character count */
5134 unicode = _PyUnicode_New(size);
5135 if (!unicode)
5136 return NULL;
5137 if (size == 0)
5138 return (PyObject *)unicode;
5139
5140 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005142 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005143 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
5145 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005146 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005148 /* Check for BOM marks (U+FEFF) in the input and adjust current
5149 byte order setting accordingly. In native mode, the leading BOM
5150 mark is skipped, in all other modes, it is copied to the output
5151 stream as-is (giving a ZWNBSP character). */
5152 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005153 if (size >= 2) {
5154 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005155#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 if (bom == 0xFEFF) {
5157 q += 2;
5158 bo = -1;
5159 }
5160 else if (bom == 0xFFFE) {
5161 q += 2;
5162 bo = 1;
5163 }
Tim Petersced69f82003-09-16 20:30:58 +00005164#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 if (bom == 0xFEFF) {
5166 q += 2;
5167 bo = 1;
5168 }
5169 else if (bom == 0xFFFE) {
5170 q += 2;
5171 bo = -1;
5172 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005173#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Tim Peters772747b2001-08-09 22:21:55 +00005177 if (bo == -1) {
5178 /* force LE */
5179 ihi = 1;
5180 ilo = 0;
5181 }
5182 else if (bo == 1) {
5183 /* force BE */
5184 ihi = 0;
5185 ilo = 1;
5186 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005187#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5188 native_ordering = ilo < ihi;
5189#else
5190 native_ordering = ilo > ihi;
5191#endif
Tim Peters772747b2001-08-09 22:21:55 +00005192
Antoine Pitrouab868312009-01-10 15:40:25 +00005193 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005194 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005196 /* First check for possible aligned read of a C 'long'. Unaligned
5197 reads are more expensive, better to defer to another iteration. */
5198 if (!((size_t) q & LONG_PTR_MASK)) {
5199 /* Fast path for runs of non-surrogate chars. */
5200 register const unsigned char *_q = q;
5201 Py_UNICODE *_p = p;
5202 if (native_ordering) {
5203 /* Native ordering is simple: as long as the input cannot
5204 possibly contain a surrogate char, do an unrolled copy
5205 of several 16-bit code points to the target object.
5206 The non-surrogate check is done on several input bytes
5207 at a time (as many as a C 'long' can contain). */
5208 while (_q < aligned_end) {
5209 unsigned long data = * (unsigned long *) _q;
5210 if (data & FAST_CHAR_MASK)
5211 break;
5212 _p[0] = ((unsigned short *) _q)[0];
5213 _p[1] = ((unsigned short *) _q)[1];
5214#if (SIZEOF_LONG == 8)
5215 _p[2] = ((unsigned short *) _q)[2];
5216 _p[3] = ((unsigned short *) _q)[3];
5217#endif
5218 _q += SIZEOF_LONG;
5219 _p += SIZEOF_LONG / 2;
5220 }
5221 }
5222 else {
5223 /* Byteswapped ordering is similar, but we must decompose
5224 the copy bytewise, and take care of zero'ing out the
5225 upper bytes if the target object is in 32-bit units
5226 (that is, in UCS-4 builds). */
5227 while (_q < aligned_end) {
5228 unsigned long data = * (unsigned long *) _q;
5229 if (data & SWAPPED_FAST_CHAR_MASK)
5230 break;
5231 /* Zero upper bytes in UCS-4 builds */
5232#if (Py_UNICODE_SIZE > 2)
5233 _p[0] = 0;
5234 _p[1] = 0;
5235#if (SIZEOF_LONG == 8)
5236 _p[2] = 0;
5237 _p[3] = 0;
5238#endif
5239#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005240 /* Issue #4916; UCS-4 builds on big endian machines must
5241 fill the two last bytes of each 4-byte unit. */
5242#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5243# define OFF 2
5244#else
5245# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005246#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005247 ((unsigned char *) _p)[OFF + 1] = _q[0];
5248 ((unsigned char *) _p)[OFF + 0] = _q[1];
5249 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5250 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5251#if (SIZEOF_LONG == 8)
5252 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5253 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5254 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5255 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5256#endif
5257#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005258 _q += SIZEOF_LONG;
5259 _p += SIZEOF_LONG / 2;
5260 }
5261 }
5262 p = _p;
5263 q = _q;
5264 if (q >= e)
5265 break;
5266 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005270
5271 if (ch < 0xD800 || ch > 0xDFFF) {
5272 *p++ = ch;
5273 continue;
5274 }
5275
5276 /* UTF-16 code pair: */
5277 if (q > e) {
5278 errmsg = "unexpected end of data";
5279 startinpos = (((const char *)q) - 2) - starts;
5280 endinpos = ((const char *)e) + 1 - starts;
5281 goto utf16Error;
5282 }
5283 if (0xD800 <= ch && ch <= 0xDBFF) {
5284 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5285 q += 2;
5286 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005287#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 *p++ = ch;
5289 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005290#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005292#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 continue;
5294 }
5295 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005296 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 startinpos = (((const char *)q)-4)-starts;
5298 endinpos = startinpos+2;
5299 goto utf16Error;
5300 }
5301
Benjamin Peterson14339b62009-01-31 16:36:08 +00005302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 errmsg = "illegal encoding";
5304 startinpos = (((const char *)q)-2)-starts;
5305 endinpos = startinpos+2;
5306 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005307
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 utf16Error:
5309 outpos = p - PyUnicode_AS_UNICODE(unicode);
5310 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005311 errors,
5312 &errorHandler,
5313 "utf16", errmsg,
5314 &starts,
5315 (const char **)&e,
5316 &startinpos,
5317 &endinpos,
5318 &exc,
5319 (const char **)&q,
5320 &unicode,
5321 &outpos,
5322 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005325 /* remaining byte at the end? (size should be even) */
5326 if (e == q) {
5327 if (!consumed) {
5328 errmsg = "truncated data";
5329 startinpos = ((const char *)q) - starts;
5330 endinpos = ((const char *)e) + 1 - starts;
5331 outpos = p - PyUnicode_AS_UNICODE(unicode);
5332 if (unicode_decode_call_errorhandler(
5333 errors,
5334 &errorHandler,
5335 "utf16", errmsg,
5336 &starts,
5337 (const char **)&e,
5338 &startinpos,
5339 &endinpos,
5340 &exc,
5341 (const char **)&q,
5342 &unicode,
5343 &outpos,
5344 &p))
5345 goto onError;
5346 /* The remaining input chars are ignored if the callback
5347 chooses to skip the input */
5348 }
5349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350
5351 if (byteorder)
5352 *byteorder = bo;
5353
Walter Dörwald69652032004-09-07 20:24:22 +00005354 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005358 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 goto onError;
5360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 Py_XDECREF(errorHandler);
5362 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005363#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005364 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005365 Py_DECREF(unicode);
5366 return NULL;
5367 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005368#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005369 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 return (PyObject *)unicode;
5371
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374 Py_XDECREF(errorHandler);
5375 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 return NULL;
5377}
5378
Antoine Pitrouab868312009-01-10 15:40:25 +00005379#undef FAST_CHAR_MASK
5380#undef SWAPPED_FAST_CHAR_MASK
5381
Tim Peters772747b2001-08-09 22:21:55 +00005382PyObject *
5383PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 Py_ssize_t size,
5385 const char *errors,
5386 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005388 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005389 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005390 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005391#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005392 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005393#else
5394 const int pairs = 0;
5395#endif
Tim Peters772747b2001-08-09 22:21:55 +00005396 /* Offsets from p for storing byte pairs in the right order. */
5397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5398 int ihi = 1, ilo = 0;
5399#else
5400 int ihi = 0, ilo = 1;
5401#endif
5402
Benjamin Peterson29060642009-01-31 22:14:21 +00005403#define STORECHAR(CH) \
5404 do { \
5405 p[ihi] = ((CH) >> 8) & 0xff; \
5406 p[ilo] = (CH) & 0xff; \
5407 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005408 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005410#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005411 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 if (s[i] >= 0x10000)
5413 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005414#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005415 /* 2 * (size + pairs + (byteorder == 0)) */
5416 if (size > PY_SSIZE_T_MAX ||
5417 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005419 nsize = size + pairs + (byteorder == 0);
5420 bytesize = nsize * 2;
5421 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005423 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 if (v == NULL)
5425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005427 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005430 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005431 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005432
5433 if (byteorder == -1) {
5434 /* force LE */
5435 ihi = 1;
5436 ilo = 0;
5437 }
5438 else if (byteorder == 1) {
5439 /* force BE */
5440 ihi = 0;
5441 ilo = 1;
5442 }
5443
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005444 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_UNICODE ch = *s++;
5446 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005447#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 if (ch >= 0x10000) {
5449 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5450 ch = 0xD800 | ((ch-0x10000) >> 10);
5451 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005452#endif
Tim Peters772747b2001-08-09 22:21:55 +00005453 STORECHAR(ch);
5454 if (ch2)
5455 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005456 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005457
5458 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005459 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005460#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461}
5462
Alexander Belopolsky40018472011-02-26 01:02:56 +00005463PyObject *
5464PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465{
5466 if (!PyUnicode_Check(unicode)) {
5467 PyErr_BadArgument();
5468 return NULL;
5469 }
5470 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 PyUnicode_GET_SIZE(unicode),
5472 NULL,
5473 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474}
5475
5476/* --- Unicode Escape Codec ----------------------------------------------- */
5477
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string s (size bytes) decodes to a pure-ASCII string.

   Returns the number of characters the decoded string will contain, or
   -1 if that cannot be guaranteed to be ASCII, namely when:
     - size is negative,
     - any input byte is > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present.

   Counting rules for the accepted escapes:
     - backslash + newline contributes 0 characters,
     - \\ \' \" \b \f \t \n \r \v \a contribute 1 character,
     - a backslash followed by any other ASCII byte contributes 2
       characters (the backslash and the byte are both kept).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative offset would point outside the input buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5532
5533/* Similar to PyUnicode_WRITE but either write into wstr field
5534 or treat string as ASCII. */
5535#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5536 do { \
5537 if ((kind) != PyUnicode_WCHAR_KIND) \
5538 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5539 else \
5540 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5541 } while (0)
5542
5543#define WRITE_WSTR(buf, index, value) \
5544 assert(kind == PyUnicode_WCHAR_KIND), \
5545 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5546
5547
Fredrik Lundh06d12682001-01-24 07:59:11 +00005548static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005549
Alexander Belopolsky40018472011-02-26 01:02:56 +00005550PyObject *
5551PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005552 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 Py_ssize_t startinpos;
5557 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005562 char* message;
5563 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 PyObject *errorHandler = NULL;
5565 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566 Py_ssize_t ascii_length;
5567 Py_ssize_t i;
5568 int kind;
5569 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 ascii_length = length_of_escaped_ascii_string(s, size);
5572
5573 /* After length_of_escaped_ascii_string() there are two alternatives,
5574 either the string is pure ASCII with named escapes like \n, etc.
5575 and we determined it's exact size (common case)
5576 or it contains \x, \u, ... escape sequences. then we create a
5577 legacy wchar string and resize it at the end of this function. */
5578 if (ascii_length >= 0) {
5579 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5580 if (!v)
5581 goto onError;
5582 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5583 kind = PyUnicode_1BYTE_KIND;
5584 data = PyUnicode_DATA(v);
5585 }
5586 else {
5587 /* Escaped strings will always be longer than the resulting
5588 Unicode string, so we start with size here and then reduce the
5589 length after conversion to the true value.
5590 (but if the error callback returns a long replacement string
5591 we'll have to allocate more space) */
5592 v = _PyUnicode_New(size);
5593 if (!v)
5594 goto onError;
5595 kind = PyUnicode_WCHAR_KIND;
5596 data = PyUnicode_AS_UNICODE(v);
5597 }
5598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 if (size == 0)
5600 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005601 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 while (s < end) {
5605 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005606 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 if (kind == PyUnicode_WCHAR_KIND) {
5610 assert(i < _PyUnicode_WSTR_LENGTH(v));
5611 }
5612 else {
5613 /* The only case in which i == ascii_length is a backslash
5614 followed by a newline. */
5615 assert(i <= ascii_length);
5616 }
5617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 /* Non-escape characters are interpreted as Unicode ordinals */
5619 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 continue;
5622 }
5623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 /* \ - Escapes */
5626 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005627 c = *s++;
5628 if (s > end)
5629 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630
5631 if (kind == PyUnicode_WCHAR_KIND) {
5632 assert(i < _PyUnicode_WSTR_LENGTH(v));
5633 }
5634 else {
5635 /* The only case in which i == ascii_length is a backslash
5636 followed by a newline. */
5637 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5638 }
5639
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005640 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005644 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5645 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5646 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5647 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5648 /* FF */
5649 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5650 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5651 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5652 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5653 /* VT */
5654 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5655 /* BEL, not classic C */
5656 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 case '0': case '1': case '2': case '3':
5660 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005661 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005662 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005663 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005664 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005665 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 break;
5669
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 /* hex escapes */
5671 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005673 digits = 2;
5674 message = "truncated \\xXX escape";
5675 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005679 digits = 4;
5680 message = "truncated \\uXXXX escape";
5681 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005684 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005685 digits = 8;
5686 message = "truncated \\UXXXXXXXX escape";
5687 hexescape:
5688 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 if (s+digits>end) {
5691 endinpos = size;
5692 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 errors, &errorHandler,
5694 "unicodeescape", "end of string in escape sequence",
5695 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005698 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 goto nextByte;
5700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701 for (j = 0; j < digits; ++j) {
5702 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005703 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704 endinpos = (s+j+1)-starts;
5705 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 errors, &errorHandler,
5708 "unicodeescape", message,
5709 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005710 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005711 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005712 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005714 }
5715 chr = (chr<<4) & ~0xF;
5716 if (c >= '0' && c <= '9')
5717 chr += c - '0';
5718 else if (c >= 'a' && c <= 'f')
5719 chr += 10 + c - 'a';
5720 else
5721 chr += 10 + c - 'A';
5722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005724 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 /* _decoding_error will have already written into the
5726 target buffer. */
5727 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005728 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005729 /* when we get here, chr is a 32-bit unicode character */
5730 if (chr <= 0xffff)
5731 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005733 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005734 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005735 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005736#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005738#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005739 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5741 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005742#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005743 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 errors, &errorHandler,
5748 "unicodeescape", "illegal Unicode character",
5749 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005750 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005751 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005753 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 break;
5755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005757 case 'N':
5758 message = "malformed \\N character escape";
5759 if (ucnhash_CAPI == NULL) {
5760 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005761 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5762 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005763 if (ucnhash_CAPI == NULL)
5764 goto ucnhashError;
5765 }
5766 if (*s == '{') {
5767 const char *start = s+1;
5768 /* look for the closing brace */
5769 while (*s != '}' && s < end)
5770 s++;
5771 if (s > start && s < end && *s == '}') {
5772 /* found a name. look it up in the unicode database */
5773 message = "unknown Unicode character name";
5774 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005775 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5776 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005777 goto store;
5778 }
5779 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 errors, &errorHandler,
5784 "unicodeescape", message,
5785 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 break;
5790
5791 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005792 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 message = "\\ at end of string";
5795 s--;
5796 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 errors, &errorHandler,
5800 "unicodeescape", message,
5801 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005802 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005803 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005804 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005805 }
5806 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005807 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5808 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005809 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005810 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005815 /* Ensure the length prediction worked in case of ASCII strings */
5816 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5817
Victor Stinnerfe226c02011-10-03 03:52:20 +02005818 if (kind == PyUnicode_WCHAR_KIND)
5819 {
5820 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5821 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005822 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005823 Py_XDECREF(errorHandler);
5824 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005825#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005826 if (_PyUnicode_READY_REPLACE(&v)) {
5827 Py_DECREF(v);
5828 return NULL;
5829 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005830#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005831 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005833
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005835 PyErr_SetString(
5836 PyExc_UnicodeError,
5837 "\\N escapes not supported (can't load unicodedata module)"
5838 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005839 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 Py_XDECREF(errorHandler);
5841 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005842 return NULL;
5843
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 Py_XDECREF(errorHandler);
5847 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 return NULL;
5849}
5850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851#undef WRITE_ASCII_OR_WSTR
5852#undef WRITE_WSTR
5853
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment used to continue "If quotes is true, the
   string is enclosed in u"" or u'' quotes as appropriate", but
   PyUnicode_EncodeUnicodeEscape() below takes only (s, size) and has no
   `quotes` parameter, so that sentence appears to be stale.

*/
5860
/* Lookup table mapping a nibble value (0..15) to its lowercase hexadecimal
   digit.  Both the characters and the pointer are const: the table is never
   meant to be re-pointed or modified. */
static const char * const hexdigits = "0123456789abcdef";
5862
Alexander Belopolsky40018472011-02-26 01:02:56 +00005863PyObject *
5864PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005865 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005867 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005870#ifdef Py_UNICODE_WIDE
5871 const Py_ssize_t expandsize = 10;
5872#else
5873 const Py_ssize_t expandsize = 6;
5874#endif
5875
Thomas Wouters89f507f2006-12-13 04:49:30 +00005876 /* XXX(nnorwitz): rather than over-allocating, it would be
5877 better to choose a different scheme. Perhaps scan the
5878 first N-chars of the string and allocate based on that size.
5879 */
5880 /* Initial allocation is based on the longest-possible unichr
5881 escape.
5882
5883 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5884 unichr, so in this case it's the longest unichr escape. In
5885 narrow (UTF-16) builds this is five chars per source unichr
5886 since there are two unichrs in the surrogate pair, so in narrow
5887 (UTF-16) builds it's not the longest unichr escape.
5888
5889 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5890 so in the narrow (UTF-16) build case it's the longest unichr
5891 escape.
5892 */
5893
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005894 if (size == 0)
5895 return PyBytes_FromStringAndSize(NULL, 0);
5896
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005897 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005899
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005900 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 2
5902 + expandsize*size
5903 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 if (repr == NULL)
5905 return NULL;
5906
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005907 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 while (size-- > 0) {
5910 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005911
Walter Dörwald79e913e2007-05-12 11:08:06 +00005912 /* Escape backslashes */
5913 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 *p++ = '\\';
5915 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005917 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005918
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005919#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005920 /* Map 21-bit characters to '\U00xxxxxx' */
5921 else if (ch >= 0x10000) {
5922 *p++ = '\\';
5923 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005924 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5925 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5926 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5927 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5928 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5929 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5930 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5931 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005933 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005934#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5936 else if (ch >= 0xD800 && ch < 0xDC00) {
5937 Py_UNICODE ch2;
5938 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005939
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 ch2 = *s++;
5941 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005942 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5944 *p++ = '\\';
5945 *p++ = 'U';
5946 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5947 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5948 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5949 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5950 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5951 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5952 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5953 *p++ = hexdigits[ucs & 0x0000000F];
5954 continue;
5955 }
5956 /* Fall through: isolated surrogates are copied as-is */
5957 s--;
5958 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005959 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005960#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005963 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 *p++ = '\\';
5965 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005966 *p++ = hexdigits[(ch >> 12) & 0x000F];
5967 *p++ = hexdigits[(ch >> 8) & 0x000F];
5968 *p++ = hexdigits[(ch >> 4) & 0x000F];
5969 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005971
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005972 /* Map special whitespace to '\t', \n', '\r' */
5973 else if (ch == '\t') {
5974 *p++ = '\\';
5975 *p++ = 't';
5976 }
5977 else if (ch == '\n') {
5978 *p++ = '\\';
5979 *p++ = 'n';
5980 }
5981 else if (ch == '\r') {
5982 *p++ = '\\';
5983 *p++ = 'r';
5984 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005985
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005986 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005987 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005989 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005990 *p++ = hexdigits[(ch >> 4) & 0x000F];
5991 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005992 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 /* Copy everything else as-is */
5995 else
5996 *p++ = (char) ch;
5997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005999 assert(p - PyBytes_AS_STRING(repr) > 0);
6000 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6001 return NULL;
6002 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
Alexander Belopolsky40018472011-02-26 01:02:56 +00006005PyObject *
6006PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006008 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 if (!PyUnicode_Check(unicode)) {
6010 PyErr_BadArgument();
6011 return NULL;
6012 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006013 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6014 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006015 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
6018/* --- Raw Unicode Escape Codec ------------------------------------------- */
6019
Alexander Belopolsky40018472011-02-26 01:02:56 +00006020PyObject *
6021PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006022 Py_ssize_t size,
6023 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t startinpos;
6027 Py_ssize_t endinpos;
6028 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 const char *end;
6032 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 PyObject *errorHandler = NULL;
6034 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006035
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 /* Escaped strings will always be longer than the resulting
6037 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 length after conversion to the true value. (But decoding error
6039 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 v = _PyUnicode_New(size);
6041 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 end = s + size;
6047 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 unsigned char c;
6049 Py_UCS4 x;
6050 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006051 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 /* Non-escape characters are interpreted as Unicode ordinals */
6054 if (*s != '\\') {
6055 *p++ = (unsigned char)*s++;
6056 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 startinpos = s-starts;
6059
6060 /* \u-escapes are only interpreted iff the number of leading
6061 backslashes if odd */
6062 bs = s;
6063 for (;s < end;) {
6064 if (*s != '\\')
6065 break;
6066 *p++ = (unsigned char)*s++;
6067 }
6068 if (((s - bs) & 1) == 0 ||
6069 s >= end ||
6070 (*s != 'u' && *s != 'U')) {
6071 continue;
6072 }
6073 p--;
6074 count = *s=='u' ? 4 : 8;
6075 s++;
6076
6077 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6078 outpos = p-PyUnicode_AS_UNICODE(v);
6079 for (x = 0, i = 0; i < count; ++i, ++s) {
6080 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006081 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 endinpos = s-starts;
6083 if (unicode_decode_call_errorhandler(
6084 errors, &errorHandler,
6085 "rawunicodeescape", "truncated \\uXXXX",
6086 &starts, &end, &startinpos, &endinpos, &exc, &s,
6087 &v, &outpos, &p))
6088 goto onError;
6089 goto nextByte;
6090 }
6091 x = (x<<4) & ~0xF;
6092 if (c >= '0' && c <= '9')
6093 x += c - '0';
6094 else if (c >= 'a' && c <= 'f')
6095 x += 10 + c - 'a';
6096 else
6097 x += 10 + c - 'A';
6098 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006099 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 /* UCS-2 character */
6101 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006102 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* UCS-4 character. Either store directly, or as
6104 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006105#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006107#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 x -= 0x10000L;
6109 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6110 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006111#endif
6112 } else {
6113 endinpos = s-starts;
6114 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006115 if (unicode_decode_call_errorhandler(
6116 errors, &errorHandler,
6117 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 &starts, &end, &startinpos, &endinpos, &exc, &s,
6119 &v, &outpos, &p))
6120 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 nextByte:
6123 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006125 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 Py_XDECREF(errorHandler);
6128 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006129#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006130 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006131 Py_DECREF(v);
6132 return NULL;
6133 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006134#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006135 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006137
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 return NULL;
6143}
6144
Alexander Belopolsky40018472011-02-26 01:02:56 +00006145PyObject *
6146PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006147 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006149 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 char *p;
6151 char *q;
6152
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006153#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006154 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006155#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006156 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006157#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006158
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006159 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006161
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006162 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 if (repr == NULL)
6164 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006165 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006166 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006168 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 while (size-- > 0) {
6170 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006171#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* Map 32-bit characters to '\Uxxxxxxxx' */
6173 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006174 *p++ = '\\';
6175 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006176 *p++ = hexdigits[(ch >> 28) & 0xf];
6177 *p++ = hexdigits[(ch >> 24) & 0xf];
6178 *p++ = hexdigits[(ch >> 20) & 0xf];
6179 *p++ = hexdigits[(ch >> 16) & 0xf];
6180 *p++ = hexdigits[(ch >> 12) & 0xf];
6181 *p++ = hexdigits[(ch >> 8) & 0xf];
6182 *p++ = hexdigits[(ch >> 4) & 0xf];
6183 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006184 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006185 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006186#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6188 if (ch >= 0xD800 && ch < 0xDC00) {
6189 Py_UNICODE ch2;
6190 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006191
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 ch2 = *s++;
6193 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006194 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6196 *p++ = '\\';
6197 *p++ = 'U';
6198 *p++ = hexdigits[(ucs >> 28) & 0xf];
6199 *p++ = hexdigits[(ucs >> 24) & 0xf];
6200 *p++ = hexdigits[(ucs >> 20) & 0xf];
6201 *p++ = hexdigits[(ucs >> 16) & 0xf];
6202 *p++ = hexdigits[(ucs >> 12) & 0xf];
6203 *p++ = hexdigits[(ucs >> 8) & 0xf];
6204 *p++ = hexdigits[(ucs >> 4) & 0xf];
6205 *p++ = hexdigits[ucs & 0xf];
6206 continue;
6207 }
6208 /* Fall through: isolated surrogates are copied as-is */
6209 s--;
6210 size++;
6211 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006212#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* Map 16-bit characters to '\uxxxx' */
6214 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 *p++ = '\\';
6216 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006217 *p++ = hexdigits[(ch >> 12) & 0xf];
6218 *p++ = hexdigits[(ch >> 8) & 0xf];
6219 *p++ = hexdigits[(ch >> 4) & 0xf];
6220 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 /* Copy everything else as-is */
6223 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 *p++ = (char) ch;
6225 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006226 size = p - q;
6227
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006228 assert(size > 0);
6229 if (_PyBytes_Resize(&repr, size) < 0)
6230 return NULL;
6231 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232}
6233
Alexander Belopolsky40018472011-02-26 01:02:56 +00006234PyObject *
6235PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006237 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006239 PyErr_BadArgument();
6240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006242 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6243 PyUnicode_GET_SIZE(unicode));
6244
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006245 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246}
6247
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248/* --- Unicode Internal Codec ------------------------------------------- */
6249
Alexander Belopolsky40018472011-02-26 01:02:56 +00006250PyObject *
6251_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006252 Py_ssize_t size,
6253 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254{
6255 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006256 Py_ssize_t startinpos;
6257 Py_ssize_t endinpos;
6258 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006259 PyUnicodeObject *v;
6260 Py_UNICODE *p;
6261 const char *end;
6262 const char *reason;
6263 PyObject *errorHandler = NULL;
6264 PyObject *exc = NULL;
6265
Neal Norwitzd43069c2006-01-08 01:12:10 +00006266#ifdef Py_UNICODE_WIDE
6267 Py_UNICODE unimax = PyUnicode_GetMax();
6268#endif
6269
Thomas Wouters89f507f2006-12-13 04:49:30 +00006270 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006271 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6272 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006274 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6275 as string was created with the old API. */
6276 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006278 p = PyUnicode_AS_UNICODE(v);
6279 end = s + size;
6280
6281 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006282 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 /* We have to sanity check the raw data, otherwise doom looms for
6284 some malformed UCS-4 data. */
6285 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006286#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006288#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006289 end-s < Py_UNICODE_SIZE
6290 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 startinpos = s - starts;
6293 if (end-s < Py_UNICODE_SIZE) {
6294 endinpos = end-starts;
6295 reason = "truncated input";
6296 }
6297 else {
6298 endinpos = s - starts + Py_UNICODE_SIZE;
6299 reason = "illegal code point (> 0x10FFFF)";
6300 }
6301 outpos = p - PyUnicode_AS_UNICODE(v);
6302 if (unicode_decode_call_errorhandler(
6303 errors, &errorHandler,
6304 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006305 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006306 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006307 goto onError;
6308 }
6309 }
6310 else {
6311 p++;
6312 s += Py_UNICODE_SIZE;
6313 }
6314 }
6315
Victor Stinnerfe226c02011-10-03 03:52:20 +02006316 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006317 goto onError;
6318 Py_XDECREF(errorHandler);
6319 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006320#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006321 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006322 Py_DECREF(v);
6323 return NULL;
6324 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006325#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006326 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006327 return (PyObject *)v;
6328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330 Py_XDECREF(v);
6331 Py_XDECREF(errorHandler);
6332 Py_XDECREF(exc);
6333 return NULL;
6334}
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336/* --- Latin-1 Codec ------------------------------------------------------ */
6337
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338PyObject *
6339PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 Py_ssize_t size,
6341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006344 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345}
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348static void
6349make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 const char *encoding,
6351 const Py_UNICODE *unicode, Py_ssize_t size,
6352 Py_ssize_t startpos, Py_ssize_t endpos,
6353 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 *exceptionObject = PyUnicodeEncodeError_Create(
6357 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 }
6359 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6361 goto onError;
6362 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6363 goto onError;
6364 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6365 goto onError;
6366 return;
6367 onError:
6368 Py_DECREF(*exceptionObject);
6369 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 }
6371}
6372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374static void
6375raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006376 const char *encoding,
6377 const Py_UNICODE *unicode, Py_ssize_t size,
6378 Py_ssize_t startpos, Py_ssize_t endpos,
6379 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380{
6381 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385}
6386
6387/* error handling callback helper:
6388 build arguments, call the callback and check the arguments,
6389 put the result into newpos and return the replacement string, which
6390 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006391static PyObject *
6392unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006393 PyObject **errorHandler,
6394 const char *encoding, const char *reason,
6395 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6396 Py_ssize_t startpos, Py_ssize_t endpos,
6397 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006399 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400
6401 PyObject *restuple;
6402 PyObject *resunicode;
6403
6404 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 }
6409
6410 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414
6415 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006420 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 &resunicode, newpos)) {
6426 Py_DECREF(restuple);
6427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6430 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6431 Py_DECREF(restuple);
6432 return NULL;
6433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006436 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6438 Py_DECREF(restuple);
6439 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 Py_INCREF(resunicode);
6442 Py_DECREF(restuple);
6443 return resunicode;
6444}
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446static PyObject *
6447unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 Py_ssize_t size,
6449 const char *errors,
6450 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451{
6452 /* output object */
6453 PyObject *res;
6454 /* pointers to the beginning and end+1 of input */
6455 const Py_UNICODE *startp = p;
6456 const Py_UNICODE *endp = p + size;
6457 /* pointer to the beginning of the unencodable characters */
6458 /* const Py_UNICODE *badp = NULL; */
6459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
6471 /* allocate enough for a simple encoding without
6472 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006473 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006474 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006475 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006477 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006478 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 ressize = size;
6480
6481 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* can we encode this? */
6485 if (c<limit) {
6486 /* no overflow check, because we know that the space is enough */
6487 *str++ = (char)c;
6488 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006489 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 else {
6491 Py_ssize_t unicodepos = p-startp;
6492 Py_ssize_t requiredsize;
6493 PyObject *repunicode;
6494 Py_ssize_t repsize;
6495 Py_ssize_t newpos;
6496 Py_ssize_t respos;
6497 Py_UNICODE *uni2;
6498 /* startpos for collecting unencodable chars */
6499 const Py_UNICODE *collstart = p;
6500 const Py_UNICODE *collend = p;
6501 /* find all unecodable characters */
6502 while ((collend < endp) && ((*collend)>=limit))
6503 ++collend;
6504 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6505 if (known_errorHandler==-1) {
6506 if ((errors==NULL) || (!strcmp(errors, "strict")))
6507 known_errorHandler = 1;
6508 else if (!strcmp(errors, "replace"))
6509 known_errorHandler = 2;
6510 else if (!strcmp(errors, "ignore"))
6511 known_errorHandler = 3;
6512 else if (!strcmp(errors, "xmlcharrefreplace"))
6513 known_errorHandler = 4;
6514 else
6515 known_errorHandler = 0;
6516 }
6517 switch (known_errorHandler) {
6518 case 1: /* strict */
6519 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6520 goto onError;
6521 case 2: /* replace */
6522 while (collstart++<collend)
6523 *str++ = '?'; /* fall through */
6524 case 3: /* ignore */
6525 p = collend;
6526 break;
6527 case 4: /* xmlcharrefreplace */
6528 respos = str - PyBytes_AS_STRING(res);
6529 /* determine replacement size (temporarily (mis)uses p) */
6530 for (p = collstart, repsize = 0; p < collend; ++p) {
6531 if (*p<10)
6532 repsize += 2+1+1;
6533 else if (*p<100)
6534 repsize += 2+2+1;
6535 else if (*p<1000)
6536 repsize += 2+3+1;
6537 else if (*p<10000)
6538 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006539#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 else
6541 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006542#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 else if (*p<100000)
6544 repsize += 2+5+1;
6545 else if (*p<1000000)
6546 repsize += 2+6+1;
6547 else
6548 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006549#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 }
6551 requiredsize = respos+repsize+(endp-collend);
6552 if (requiredsize > ressize) {
6553 if (requiredsize<2*ressize)
6554 requiredsize = 2*ressize;
6555 if (_PyBytes_Resize(&res, requiredsize))
6556 goto onError;
6557 str = PyBytes_AS_STRING(res) + respos;
6558 ressize = requiredsize;
6559 }
6560 /* generate replacement (temporarily (mis)uses p) */
6561 for (p = collstart; p < collend; ++p) {
6562 str += sprintf(str, "&#%d;", (int)*p);
6563 }
6564 p = collend;
6565 break;
6566 default:
6567 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6568 encoding, reason, startp, size, &exc,
6569 collstart-startp, collend-startp, &newpos);
6570 if (repunicode == NULL)
6571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
6587 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
6595 repsize = PyUnicode_GET_SIZE(repunicode);
6596 requiredsize = respos+repsize+(endp-collend);
6597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
6609 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6610 c = *uni2;
6611 if (c >= limit) {
6612 raise_encode_exception(&exc, encoding, startp, size,
6613 unicodepos, unicodepos+1, reason);
6614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
6619 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Alexander Belopolsky40018472011-02-26 01:02:56 +00006643PyObject *
6644PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 Py_ssize_t size,
6646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649}
6650
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006652_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653{
6654 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 PyErr_BadArgument();
6656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006658 if (PyUnicode_READY(unicode) == -1)
6659 return NULL;
6660 /* Fast path: if it is a one-byte string, construct
6661 bytes object directly. */
6662 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6663 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6664 PyUnicode_GET_LENGTH(unicode));
6665 /* Non-Latin-1 characters present. Defer to above function to
6666 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669 errors);
6670}
6671
6672PyObject*
6673PyUnicode_AsLatin1String(PyObject *unicode)
6674{
6675 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
6678/* --- 7-bit ASCII Codec -------------------------------------------------- */
6679
Alexander Belopolsky40018472011-02-26 01:02:56 +00006680PyObject *
6681PyUnicode_DecodeASCII(const char *s,
6682 Py_ssize_t size,
6683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006687 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t startinpos;
6689 Py_ssize_t endinpos;
6690 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006692 int has_error;
6693 const unsigned char *p = (const unsigned char *)s;
6694 const unsigned char *end = p + size;
6695 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 PyObject *errorHandler = NULL;
6697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006700 if (size == 1 && (unsigned char)s[0] < 128)
6701 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006702
Victor Stinner702c7342011-10-05 13:50:52 +02006703 has_error = 0;
6704 while (p < end && !has_error) {
6705 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6706 an explanation. */
6707 if (!((size_t) p & LONG_PTR_MASK)) {
6708 /* Help register allocation */
6709 register const unsigned char *_p = p;
6710 while (_p < aligned_end) {
6711 unsigned long value = *(unsigned long *) _p;
6712 if (value & ASCII_CHAR_MASK) {
6713 has_error = 1;
6714 break;
6715 }
6716 _p += SIZEOF_LONG;
6717 }
6718 if (_p == end)
6719 break;
6720 if (has_error)
6721 break;
6722 p = _p;
6723 }
6724 if (*p & 0x80) {
6725 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006726 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006727 }
6728 else {
6729 ++p;
6730 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006731 }
Victor Stinner702c7342011-10-05 13:50:52 +02006732 if (!has_error)
6733 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 v = _PyUnicode_New(size);
6736 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006740 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 e = s + size;
6742 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 register unsigned char c = (unsigned char)*s;
6744 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006745 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 ++s;
6747 }
6748 else {
6749 startinpos = s-starts;
6750 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006751 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 if (unicode_decode_call_errorhandler(
6753 errors, &errorHandler,
6754 "ascii", "ordinal not in range(128)",
6755 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006756 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 goto onError;
6758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 }
Victor Stinner702c7342011-10-05 13:50:52 +02006760 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6761 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763 Py_XDECREF(errorHandler);
6764 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006765#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006766 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006767 Py_DECREF(v);
6768 return NULL;
6769 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006770#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006771 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 Py_XDECREF(errorHandler);
6777 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 return NULL;
6779}
6780
Alexander Belopolsky40018472011-02-26 01:02:56 +00006781PyObject *
6782PyUnicode_EncodeASCII(const Py_UNICODE *p,
6783 Py_ssize_t size,
6784 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787}
6788
Alexander Belopolsky40018472011-02-26 01:02:56 +00006789PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006790_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
6792 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 PyErr_BadArgument();
6794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006796 if (PyUnicode_READY(unicode) == -1)
6797 return NULL;
6798 /* Fast path: if it is an ASCII-only string, construct bytes object
6799 directly. Else defer to above function to raise the exception. */
6800 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6801 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6802 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006805 errors);
6806}
6807
6808PyObject *
6809PyUnicode_AsASCIIString(PyObject *unicode)
6810{
6811 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
Victor Stinner99b95382011-07-04 14:23:54 +02006814#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006815
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006816/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006817
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006818#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819#define NEED_RETRY
6820#endif
6821
6822/* XXX This code is limited to "true" double-byte encodings, as
6823 a) it assumes an incomplete character consists of a single byte, and
6824 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006826
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.  The byte must satisfy
   IsDBCSLeadByte(); it is then accepted when CharPrev() cannot step
   back, when the previous character does not begin with a lead byte,
   or when the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6838
6839/*
6840 * Decode MBCS string into unicode object. If 'final' is set, converts
6841 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6842 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006843static int
6844decode_mbcs(PyUnicodeObject **v,
6845 const char *s, /* MBCS string */
6846 int size, /* sizeof MBCS string */
6847 int final,
6848 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849{
6850 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006851 Py_ssize_t n;
6852 DWORD usize;
6853 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854
6855 assert(size >= 0);
6856
Victor Stinner554f3f02010-06-16 23:33:54 +00006857 /* check and handle 'errors' arg */
6858 if (errors==NULL || strcmp(errors, "strict")==0)
6859 flags = MB_ERR_INVALID_CHARS;
6860 else if (strcmp(errors, "ignore")==0)
6861 flags = 0;
6862 else {
6863 PyErr_Format(PyExc_ValueError,
6864 "mbcs encoding does not support errors='%s'",
6865 errors);
6866 return -1;
6867 }
6868
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869 /* Skip trailing lead-byte unless 'final' is set */
6870 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872
6873 /* First get the size of the result */
6874 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006875 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6876 if (usize==0)
6877 goto mbcs_decode_error;
6878 } else
6879 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880
6881 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 /* Create unicode object */
6883 *v = _PyUnicode_New(usize);
6884 if (*v == NULL)
6885 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006886 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 }
6888 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 /* Extend unicode object */
6890 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006891 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893 }
6894
6895 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006896 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006898 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6899 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006903
6904mbcs_decode_error:
6905 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6906 we raise a UnicodeDecodeError - else it is a 'generic'
6907 windows error
6908 */
6909 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6910 /* Ideally, we should get reason from FormatMessage - this
6911 is the Windows 2000 English version of the message
6912 */
6913 PyObject *exc = NULL;
6914 const char *reason = "No mapping for the Unicode character exists "
6915 "in the target multi-byte code page.";
6916 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6917 if (exc != NULL) {
6918 PyCodec_StrictErrors(exc);
6919 Py_DECREF(exc);
6920 }
6921 } else {
6922 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6923 }
6924 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925}
6926
Alexander Belopolsky40018472011-02-26 01:02:56 +00006927PyObject *
6928PyUnicode_DecodeMBCSStateful(const char *s,
6929 Py_ssize_t size,
6930 const char *errors,
6931 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006932{
6933 PyUnicodeObject *v = NULL;
6934 int done;
6935
6936 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938
6939#ifdef NEED_RETRY
6940 retry:
6941 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006942 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943 else
6944#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006945 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946
6947 if (done < 0) {
6948 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950 }
6951
6952 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954
6955#ifdef NEED_RETRY
6956 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 s += done;
6958 size -= done;
6959 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 }
6961#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006962#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006963 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006964 Py_DECREF(v);
6965 return NULL;
6966 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006967#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006968 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 return (PyObject *)v;
6970}
6971
Alexander Belopolsky40018472011-02-26 01:02:56 +00006972PyObject *
6973PyUnicode_DecodeMBCS(const char *s,
6974 Py_ssize_t size,
6975 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006976{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6978}
6979
6980/*
6981 * Convert unicode into string object (MBCS).
6982 * Returns 0 if succeed, -1 otherwise.
6983 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006984static int
6985encode_mbcs(PyObject **repr,
6986 const Py_UNICODE *p, /* unicode */
6987 int size, /* size of unicode */
6988 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006989{
Victor Stinner554f3f02010-06-16 23:33:54 +00006990 BOOL usedDefaultChar = FALSE;
6991 BOOL *pusedDefaultChar;
6992 int mbcssize;
6993 Py_ssize_t n;
6994 PyObject *exc = NULL;
6995 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996
6997 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006998
Victor Stinner554f3f02010-06-16 23:33:54 +00006999 /* check and handle 'errors' arg */
7000 if (errors==NULL || strcmp(errors, "strict")==0) {
7001 flags = WC_NO_BEST_FIT_CHARS;
7002 pusedDefaultChar = &usedDefaultChar;
7003 } else if (strcmp(errors, "replace")==0) {
7004 flags = 0;
7005 pusedDefaultChar = NULL;
7006 } else {
7007 PyErr_Format(PyExc_ValueError,
7008 "mbcs encoding does not support errors='%s'",
7009 errors);
7010 return -1;
7011 }
7012
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007013 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007015 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7016 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 if (mbcssize == 0) {
7018 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7019 return -1;
7020 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007021 /* If we used a default char, then we failed! */
7022 if (pusedDefaultChar && *pusedDefaultChar)
7023 goto mbcs_encode_error;
7024 } else {
7025 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007026 }
7027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 /* Create string object */
7030 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7031 if (*repr == NULL)
7032 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007033 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007034 }
7035 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 /* Extend string object */
7037 n = PyBytes_Size(*repr);
7038 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7039 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040 }
7041
7042 /* Do the conversion */
7043 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007045 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7046 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7048 return -1;
7049 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007050 if (pusedDefaultChar && *pusedDefaultChar)
7051 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007054
7055mbcs_encode_error:
7056 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7057 Py_XDECREF(exc);
7058 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007059}
7060
Alexander Belopolsky40018472011-02-26 01:02:56 +00007061PyObject *
7062PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7063 Py_ssize_t size,
7064 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007065{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066 PyObject *repr = NULL;
7067 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007068
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007072 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073 else
7074#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007075 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 Py_XDECREF(repr);
7079 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007080 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
7082#ifdef NEED_RETRY
7083 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 p += INT_MAX;
7085 size -= INT_MAX;
7086 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087 }
7088#endif
7089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007090 return repr;
7091}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007092
Alexander Belopolsky40018472011-02-26 01:02:56 +00007093PyObject *
7094PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007095{
7096 if (!PyUnicode_Check(unicode)) {
7097 PyErr_BadArgument();
7098 return NULL;
7099 }
7100 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 PyUnicode_GET_SIZE(unicode),
7102 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007103}
7104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105#undef NEED_RETRY
7106
Victor Stinner99b95382011-07-04 14:23:54 +02007107#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007108
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109/* --- Character Mapping Codec -------------------------------------------- */
7110
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111PyObject *
7112PyUnicode_DecodeCharmap(const char *s,
7113 Py_ssize_t size,
7114 PyObject *mapping,
7115 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007117 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 Py_ssize_t startinpos;
7119 Py_ssize_t endinpos;
7120 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 PyUnicodeObject *v;
7123 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007124 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007125 PyObject *errorHandler = NULL;
7126 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007127 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007128 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007129
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 /* Default to Latin-1 */
7131 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134 v = _PyUnicode_New(size);
7135 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007141 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 mapstring = PyUnicode_AS_UNICODE(mapping);
7143 maplen = PyUnicode_GET_SIZE(mapping);
7144 while (s < e) {
7145 unsigned char ch = *s;
7146 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 if (ch < maplen)
7149 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 if (x == 0xfffe) {
7152 /* undefined mapping */
7153 outpos = p-PyUnicode_AS_UNICODE(v);
7154 startinpos = s-starts;
7155 endinpos = startinpos+1;
7156 if (unicode_decode_call_errorhandler(
7157 errors, &errorHandler,
7158 "charmap", "character maps to <undefined>",
7159 &starts, &e, &startinpos, &endinpos, &exc, &s,
7160 &v, &outpos, &p)) {
7161 goto onError;
7162 }
7163 continue;
7164 }
7165 *p++ = x;
7166 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007167 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007168 }
7169 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 while (s < e) {
7171 unsigned char ch = *s;
7172 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007173
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7175 w = PyLong_FromLong((long)ch);
7176 if (w == NULL)
7177 goto onError;
7178 x = PyObject_GetItem(mapping, w);
7179 Py_DECREF(w);
7180 if (x == NULL) {
7181 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7182 /* No mapping found means: mapping is undefined. */
7183 PyErr_Clear();
7184 x = Py_None;
7185 Py_INCREF(x);
7186 } else
7187 goto onError;
7188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007189
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 /* Apply mapping */
7191 if (PyLong_Check(x)) {
7192 long value = PyLong_AS_LONG(x);
7193 if (value < 0 || value > 65535) {
7194 PyErr_SetString(PyExc_TypeError,
7195 "character mapping must be in range(65536)");
7196 Py_DECREF(x);
7197 goto onError;
7198 }
7199 *p++ = (Py_UNICODE)value;
7200 }
7201 else if (x == Py_None) {
7202 /* undefined mapping */
7203 outpos = p-PyUnicode_AS_UNICODE(v);
7204 startinpos = s-starts;
7205 endinpos = startinpos+1;
7206 if (unicode_decode_call_errorhandler(
7207 errors, &errorHandler,
7208 "charmap", "character maps to <undefined>",
7209 &starts, &e, &startinpos, &endinpos, &exc, &s,
7210 &v, &outpos, &p)) {
7211 Py_DECREF(x);
7212 goto onError;
7213 }
7214 Py_DECREF(x);
7215 continue;
7216 }
7217 else if (PyUnicode_Check(x)) {
7218 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007219
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 if (targetsize == 1)
7221 /* 1-1 mapping */
7222 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007223
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 else if (targetsize > 1) {
7225 /* 1-n mapping */
7226 if (targetsize > extrachars) {
7227 /* resize first */
7228 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7229 Py_ssize_t needed = (targetsize - extrachars) + \
7230 (targetsize << 2);
7231 extrachars += needed;
7232 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007233 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 PyUnicode_GET_SIZE(v) + needed) < 0) {
7235 Py_DECREF(x);
7236 goto onError;
7237 }
7238 p = PyUnicode_AS_UNICODE(v) + oldpos;
7239 }
7240 Py_UNICODE_COPY(p,
7241 PyUnicode_AS_UNICODE(x),
7242 targetsize);
7243 p += targetsize;
7244 extrachars -= targetsize;
7245 }
7246 /* 1-0 mapping: skip the character */
7247 }
7248 else {
7249 /* wrong return value */
7250 PyErr_SetString(PyExc_TypeError,
7251 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007252 Py_DECREF(x);
7253 goto onError;
7254 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 Py_DECREF(x);
7256 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 }
7259 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007260 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 Py_XDECREF(errorHandler);
7263 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007264#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007265 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007266 Py_DECREF(v);
7267 return NULL;
7268 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007269#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007270 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007272
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 Py_XDECREF(errorHandler);
7275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 Py_XDECREF(v);
7277 return NULL;
7278}
7279
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007280/* Charmap encoding: the lookup table */
7281
Alexander Belopolsky40018472011-02-26 01:02:56 +00007282struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 PyObject_HEAD
7284 unsigned char level1[32];
7285 int count2, count3;
7286 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007287};
7288
7289static PyObject*
7290encoding_map_size(PyObject *obj, PyObject* args)
7291{
7292 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007295}
7296
7297static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007298 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 PyDoc_STR("Return the size (in bytes) of this object") },
7300 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007301};
7302
7303static void
7304encoding_map_dealloc(PyObject* o)
7305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007306 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007307}
7308
7309static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007310 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 "EncodingMap", /*tp_name*/
7312 sizeof(struct encoding_map), /*tp_basicsize*/
7313 0, /*tp_itemsize*/
7314 /* methods */
7315 encoding_map_dealloc, /*tp_dealloc*/
7316 0, /*tp_print*/
7317 0, /*tp_getattr*/
7318 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007319 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 0, /*tp_repr*/
7321 0, /*tp_as_number*/
7322 0, /*tp_as_sequence*/
7323 0, /*tp_as_mapping*/
7324 0, /*tp_hash*/
7325 0, /*tp_call*/
7326 0, /*tp_str*/
7327 0, /*tp_getattro*/
7328 0, /*tp_setattro*/
7329 0, /*tp_as_buffer*/
7330 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7331 0, /*tp_doc*/
7332 0, /*tp_traverse*/
7333 0, /*tp_clear*/
7334 0, /*tp_richcompare*/
7335 0, /*tp_weaklistoffset*/
7336 0, /*tp_iter*/
7337 0, /*tp_iternext*/
7338 encoding_map_methods, /*tp_methods*/
7339 0, /*tp_members*/
7340 0, /*tp_getset*/
7341 0, /*tp_base*/
7342 0, /*tp_dict*/
7343 0, /*tp_descr_get*/
7344 0, /*tp_descr_set*/
7345 0, /*tp_dictoffset*/
7346 0, /*tp_init*/
7347 0, /*tp_alloc*/
7348 0, /*tp_new*/
7349 0, /*tp_free*/
7350 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007351};
7352
7353PyObject*
7354PyUnicode_BuildEncodingMap(PyObject* string)
7355{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007356 PyObject *result;
7357 struct encoding_map *mresult;
7358 int i;
7359 int need_dict = 0;
7360 unsigned char level1[32];
7361 unsigned char level2[512];
7362 unsigned char *mlevel1, *mlevel2, *mlevel3;
7363 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364 int kind;
7365 void *data;
7366 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007369 PyErr_BadArgument();
7370 return NULL;
7371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 kind = PyUnicode_KIND(string);
7373 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007374 memset(level1, 0xFF, sizeof level1);
7375 memset(level2, 0xFF, sizeof level2);
7376
7377 /* If there isn't a one-to-one mapping of NULL to \0,
7378 or if there are non-BMP characters, we need to use
7379 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007381 need_dict = 1;
7382 for (i = 1; i < 256; i++) {
7383 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 ch = PyUnicode_READ(kind, data, i);
7385 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007386 need_dict = 1;
7387 break;
7388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007390 /* unmapped character */
7391 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007392 l1 = ch >> 11;
7393 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007394 if (level1[l1] == 0xFF)
7395 level1[l1] = count2++;
7396 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007397 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007398 }
7399
7400 if (count2 >= 0xFF || count3 >= 0xFF)
7401 need_dict = 1;
7402
7403 if (need_dict) {
7404 PyObject *result = PyDict_New();
7405 PyObject *key, *value;
7406 if (!result)
7407 return NULL;
7408 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007410 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007411 if (!key || !value)
7412 goto failed1;
7413 if (PyDict_SetItem(result, key, value) == -1)
7414 goto failed1;
7415 Py_DECREF(key);
7416 Py_DECREF(value);
7417 }
7418 return result;
7419 failed1:
7420 Py_XDECREF(key);
7421 Py_XDECREF(value);
7422 Py_DECREF(result);
7423 return NULL;
7424 }
7425
7426 /* Create a three-level trie */
7427 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7428 16*count2 + 128*count3 - 1);
7429 if (!result)
7430 return PyErr_NoMemory();
7431 PyObject_Init(result, &EncodingMapType);
7432 mresult = (struct encoding_map*)result;
7433 mresult->count2 = count2;
7434 mresult->count3 = count3;
7435 mlevel1 = mresult->level1;
7436 mlevel2 = mresult->level23;
7437 mlevel3 = mresult->level23 + 16*count2;
7438 memcpy(mlevel1, level1, 32);
7439 memset(mlevel2, 0xFF, 16*count2);
7440 memset(mlevel3, 0, 128*count3);
7441 count3 = 0;
7442 for (i = 1; i < 256; i++) {
7443 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007445 /* unmapped character */
7446 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 o1 = PyUnicode_READ(kind, data, i)>>11;
7448 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007449 i2 = 16*mlevel1[o1] + o2;
7450 if (mlevel2[i2] == 0xFF)
7451 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453 i3 = 128*mlevel2[i2] + o3;
7454 mlevel3[i3] = i;
7455 }
7456 return result;
7457}
7458
7459static int
7460encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7461{
7462 struct encoding_map *map = (struct encoding_map*)mapping;
7463 int l1 = c>>11;
7464 int l2 = (c>>7) & 0xF;
7465 int l3 = c & 0x7F;
7466 int i;
7467
7468#ifdef Py_UNICODE_WIDE
7469 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007471 }
7472#endif
7473 if (c == 0)
7474 return 0;
7475 /* level 1*/
7476 i = map->level1[l1];
7477 if (i == 0xFF) {
7478 return -1;
7479 }
7480 /* level 2*/
7481 i = map->level23[16*i+l2];
7482 if (i == 0xFF) {
7483 return -1;
7484 }
7485 /* level 3 */
7486 i = map->level23[16*map->count2 + 128*i + l3];
7487 if (i == 0) {
7488 return -1;
7489 }
7490 return i;
7491}
7492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493/* Lookup the character ch in the mapping. If the character
7494 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007495 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007496static PyObject *
7497charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498{
Christian Heimes217cfd12007-12-02 14:31:20 +00007499 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007500 PyObject *x;
7501
7502 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504 x = PyObject_GetItem(mapping, w);
7505 Py_DECREF(w);
7506 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7508 /* No mapping found means: mapping is undefined. */
7509 PyErr_Clear();
7510 x = Py_None;
7511 Py_INCREF(x);
7512 return x;
7513 } else
7514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007516 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007518 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 long value = PyLong_AS_LONG(x);
7520 if (value < 0 || value > 255) {
7521 PyErr_SetString(PyExc_TypeError,
7522 "character mapping must be in range(256)");
7523 Py_DECREF(x);
7524 return NULL;
7525 }
7526 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007528 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 /* wrong return value */
7532 PyErr_Format(PyExc_TypeError,
7533 "character mapping must return integer, bytes or None, not %.400s",
7534 x->ob_type->tp_name);
7535 Py_DECREF(x);
7536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 }
7538}
7539
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007540static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007541charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007542{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007543 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7544 /* exponentially overallocate to minimize reallocations */
7545 if (requiredsize < 2*outsize)
7546 requiredsize = 2*outsize;
7547 if (_PyBytes_Resize(outobj, requiredsize))
7548 return -1;
7549 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007550}
7551
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* character was encoded and written */
    enc_FAILED,         /* character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007556 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 space is available. Return a new reference to the object that
7558 was put in the output buffer, or Py_None, if the mapping was undefined
7559 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007560 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561static charmapencode_result
7562charmapencode_output(Py_UNICODE c, PyObject *mapping,
7563 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007565 PyObject *rep;
7566 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007567 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568
Christian Heimes90aa7642007-12-19 02:45:37 +00007569 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007572 if (res == -1)
7573 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 if (outsize<requiredsize)
7575 if (charmapencode_resize(outobj, outpos, requiredsize))
7576 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007577 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 outstart[(*outpos)++] = (char)res;
7579 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007580 }
7581
7582 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007585 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 Py_DECREF(rep);
7587 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007588 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (PyLong_Check(rep)) {
7590 Py_ssize_t requiredsize = *outpos+1;
7591 if (outsize<requiredsize)
7592 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7593 Py_DECREF(rep);
7594 return enc_EXCEPTION;
7595 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007596 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 else {
7600 const char *repchars = PyBytes_AS_STRING(rep);
7601 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7602 Py_ssize_t requiredsize = *outpos+repsize;
7603 if (outsize<requiredsize)
7604 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7605 Py_DECREF(rep);
7606 return enc_EXCEPTION;
7607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007608 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 memcpy(outstart + *outpos, repchars, repsize);
7610 *outpos += repsize;
7611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613 Py_DECREF(rep);
7614 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615}
7616
7617/* handle an error in PyUnicode_EncodeCharmap
7618 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007619static int
7620charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007621 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007622 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007623 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007624 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007625{
7626 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007627 Py_ssize_t repsize;
7628 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007629 Py_UNICODE *uni2;
7630 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007631 Py_ssize_t collstartpos = *inpos;
7632 Py_ssize_t collendpos = *inpos+1;
7633 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 char *encoding = "charmap";
7635 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007636 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007638 /* find all unencodable characters */
7639 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007640 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007641 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 int res = encoding_map_lookup(p[collendpos], mapping);
7643 if (res != -1)
7644 break;
7645 ++collendpos;
7646 continue;
7647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 rep = charmapencode_lookup(p[collendpos], mapping);
7650 if (rep==NULL)
7651 return -1;
7652 else if (rep!=Py_None) {
7653 Py_DECREF(rep);
7654 break;
7655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 }
7659 /* cache callback name lookup
7660 * (if not done yet, i.e. it's the first error) */
7661 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 if ((errors==NULL) || (!strcmp(errors, "strict")))
7663 *known_errorHandler = 1;
7664 else if (!strcmp(errors, "replace"))
7665 *known_errorHandler = 2;
7666 else if (!strcmp(errors, "ignore"))
7667 *known_errorHandler = 3;
7668 else if (!strcmp(errors, "xmlcharrefreplace"))
7669 *known_errorHandler = 4;
7670 else
7671 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672 }
7673 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007674 case 1: /* strict */
7675 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7676 return -1;
7677 case 2: /* replace */
7678 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 x = charmapencode_output('?', mapping, res, respos);
7680 if (x==enc_EXCEPTION) {
7681 return -1;
7682 }
7683 else if (x==enc_FAILED) {
7684 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7685 return -1;
7686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007687 }
7688 /* fall through */
7689 case 3: /* ignore */
7690 *inpos = collendpos;
7691 break;
7692 case 4: /* xmlcharrefreplace */
7693 /* generate replacement (temporarily (mis)uses p) */
7694 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 char buffer[2+29+1+1];
7696 char *cp;
7697 sprintf(buffer, "&#%d;", (int)p[collpos]);
7698 for (cp = buffer; *cp; ++cp) {
7699 x = charmapencode_output(*cp, mapping, res, respos);
7700 if (x==enc_EXCEPTION)
7701 return -1;
7702 else if (x==enc_FAILED) {
7703 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7704 return -1;
7705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007706 }
7707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007708 *inpos = collendpos;
7709 break;
7710 default:
7711 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 encoding, reason, p, size, exceptionObject,
7713 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007716 if (PyBytes_Check(repunicode)) {
7717 /* Directly copy bytes result to output. */
7718 Py_ssize_t outsize = PyBytes_Size(*res);
7719 Py_ssize_t requiredsize;
7720 repsize = PyBytes_Size(repunicode);
7721 requiredsize = *respos + repsize;
7722 if (requiredsize > outsize)
7723 /* Make room for all additional bytes. */
7724 if (charmapencode_resize(res, respos, requiredsize)) {
7725 Py_DECREF(repunicode);
7726 return -1;
7727 }
7728 memcpy(PyBytes_AsString(*res) + *respos,
7729 PyBytes_AsString(repunicode), repsize);
7730 *respos += repsize;
7731 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007732 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007733 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 /* generate replacement */
7736 repsize = PyUnicode_GET_SIZE(repunicode);
7737 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 x = charmapencode_output(*uni2, mapping, res, respos);
7739 if (x==enc_EXCEPTION) {
7740 return -1;
7741 }
7742 else if (x==enc_FAILED) {
7743 Py_DECREF(repunicode);
7744 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7745 return -1;
7746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007747 }
7748 *inpos = newpos;
7749 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750 }
7751 return 0;
7752}
7753
Alexander Belopolsky40018472011-02-26 01:02:56 +00007754PyObject *
7755PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7756 Py_ssize_t size,
7757 PyObject *mapping,
7758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760 /* output object */
7761 PyObject *res = NULL;
7762 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007763 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007764 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007765 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007766 PyObject *errorHandler = NULL;
7767 PyObject *exc = NULL;
7768 /* the following variable is used for caching string comparisons
7769 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7770 * 3=ignore, 4=xmlcharrefreplace */
7771 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
7773 /* Default to Latin-1 */
7774 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 /* allocate enough for a simple encoding without
7778 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007779 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007780 if (res == NULL)
7781 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007782 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 /* try to encode it */
7787 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7788 if (x==enc_EXCEPTION) /* error */
7789 goto onError;
7790 if (x==enc_FAILED) { /* unencodable character */
7791 if (charmap_encoding_error(p, size, &inpos, mapping,
7792 &exc,
7793 &known_errorHandler, &errorHandler, errors,
7794 &res, &respos)) {
7795 goto onError;
7796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 else
7799 /* done with this character => adjust input position */
7800 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007803 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007804 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007805 if (_PyBytes_Resize(&res, respos) < 0)
7806 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 Py_XDECREF(exc);
7809 Py_XDECREF(errorHandler);
7810 return res;
7811
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007813 Py_XDECREF(res);
7814 Py_XDECREF(exc);
7815 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 return NULL;
7817}
7818
Alexander Belopolsky40018472011-02-26 01:02:56 +00007819PyObject *
7820PyUnicode_AsCharmapString(PyObject *unicode,
7821 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822{
7823 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 PyErr_BadArgument();
7825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 }
7827 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 PyUnicode_GET_SIZE(unicode),
7829 mapping,
7830 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831}
7832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007834static void
7835make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007837 Py_ssize_t startpos, Py_ssize_t endpos,
7838 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841 *exceptionObject = _PyUnicodeTranslateError_Create(
7842 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 }
7844 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7846 goto onError;
7847 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7848 goto onError;
7849 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7850 goto onError;
7851 return;
7852 onError:
7853 Py_DECREF(*exceptionObject);
7854 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 }
7856}
7857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007858/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007859static void
7860raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862 Py_ssize_t startpos, Py_ssize_t endpos,
7863 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007864{
7865 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007867 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869}
7870
7871/* error handling callback helper:
7872 build arguments, call the callback and check the arguments,
7873 put the result into newpos and return the replacement string, which
7874 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007875static PyObject *
7876unicode_translate_call_errorhandler(const char *errors,
7877 PyObject **errorHandler,
7878 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880 Py_ssize_t startpos, Py_ssize_t endpos,
7881 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007882{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007883 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007884
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007885 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007886 PyObject *restuple;
7887 PyObject *resunicode;
7888
7889 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 }
7894
7895 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007897 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007899
7900 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007905 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 Py_DECREF(restuple);
7907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007908 }
7909 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 &resunicode, &i_newpos)) {
7911 Py_DECREF(restuple);
7912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007913 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007914 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007916 else
7917 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7920 Py_DECREF(restuple);
7921 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 Py_INCREF(resunicode);
7924 Py_DECREF(restuple);
7925 return resunicode;
7926}
7927
7928/* Lookup the character ch in the mapping and put the result in result,
7929 which must be decrefed by the caller.
7930 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007931static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933{
Christian Heimes217cfd12007-12-02 14:31:20 +00007934 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 PyObject *x;
7936
7937 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007939 x = PyObject_GetItem(mapping, w);
7940 Py_DECREF(w);
7941 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7943 /* No mapping found means: use 1:1 mapping. */
7944 PyErr_Clear();
7945 *result = NULL;
7946 return 0;
7947 } else
7948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
7950 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 *result = x;
7952 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007954 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 long value = PyLong_AS_LONG(x);
7956 long max = PyUnicode_GetMax();
7957 if (value < 0 || value > max) {
7958 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007959 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 Py_DECREF(x);
7961 return -1;
7962 }
7963 *result = x;
7964 return 0;
7965 }
7966 else if (PyUnicode_Check(x)) {
7967 *result = x;
7968 return 0;
7969 }
7970 else {
7971 /* wrong return value */
7972 PyErr_SetString(PyExc_TypeError,
7973 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 Py_DECREF(x);
7975 return -1;
7976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977}
7978/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 if not reallocate and adjust various state variables.
7980 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007981static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007986 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 /* exponentially overallocate to minimize reallocations */
7988 if (requiredsize < 2 * oldsize)
7989 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7991 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007994 }
7995 return 0;
7996}
7997/* lookup the character, put the result in the output string and adjust
7998 various state variables. Return a new reference to the object that
7999 was put in the output buffer in *result, or Py_None, if the mapping was
8000 undefined (in which case no character was written).
8001 The called must decref result.
8002 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008003static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8005 PyObject *mapping, Py_UCS4 **output,
8006 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008007 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008009 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8010 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 }
8016 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008018 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008020 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 }
8022 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008023 Py_ssize_t repsize;
8024 if (PyUnicode_READY(*res) == -1)
8025 return -1;
8026 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 if (repsize==1) {
8028 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 }
8031 else if (repsize!=0) {
8032 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008033 Py_ssize_t requiredsize = *opos +
8034 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 Py_ssize_t i;
8037 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008039 for(i = 0; i < repsize; i++)
8040 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 }
8043 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045 return 0;
8046}
8047
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049_PyUnicode_TranslateCharmap(PyObject *input,
8050 PyObject *mapping,
8051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008053 /* input object */
8054 char *idata;
8055 Py_ssize_t size, i;
8056 int kind;
8057 /* output buffer */
8058 Py_UCS4 *output = NULL;
8059 Py_ssize_t osize;
8060 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063 char *reason = "character maps to <undefined>";
8064 PyObject *errorHandler = NULL;
8065 PyObject *exc = NULL;
8066 /* the following variable is used for caching string comparisons
8067 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8068 * 3=ignore, 4=xmlcharrefreplace */
8069 int known_errorHandler = -1;
8070
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 PyErr_BadArgument();
8073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 if (PyUnicode_READY(input) == -1)
8077 return NULL;
8078 idata = (char*)PyUnicode_DATA(input);
8079 kind = PyUnicode_KIND(input);
8080 size = PyUnicode_GET_LENGTH(input);
8081 i = 0;
8082
8083 if (size == 0) {
8084 Py_INCREF(input);
8085 return input;
8086 }
8087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 /* allocate enough for a simple 1:1 translation without
8089 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 osize = size;
8091 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8092 opos = 0;
8093 if (output == NULL) {
8094 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 /* try to encode it */
8100 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 if (charmaptranslate_output(input, i, mapping,
8102 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 Py_XDECREF(x);
8104 goto onError;
8105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 else { /* untranslatable character */
8110 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8111 Py_ssize_t repsize;
8112 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 Py_ssize_t collstart = i;
8116 Py_ssize_t collend = i+1;
8117 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 while (collend < size) {
8121 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 goto onError;
8123 Py_XDECREF(x);
8124 if (x!=Py_None)
8125 break;
8126 ++collend;
8127 }
8128 /* cache callback name lookup
8129 * (if not done yet, i.e. it's the first error) */
8130 if (known_errorHandler==-1) {
8131 if ((errors==NULL) || (!strcmp(errors, "strict")))
8132 known_errorHandler = 1;
8133 else if (!strcmp(errors, "replace"))
8134 known_errorHandler = 2;
8135 else if (!strcmp(errors, "ignore"))
8136 known_errorHandler = 3;
8137 else if (!strcmp(errors, "xmlcharrefreplace"))
8138 known_errorHandler = 4;
8139 else
8140 known_errorHandler = 0;
8141 }
8142 switch (known_errorHandler) {
8143 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 raise_translate_exception(&exc, input, collstart,
8145 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 case 2: /* replace */
8148 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 for (coll = collstart; coll<collend; coll++)
8150 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 /* fall through */
8152 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 break;
8155 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 /* generate replacement (temporarily (mis)uses i) */
8157 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 char buffer[2+29+1+1];
8159 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8161 if (charmaptranslate_makespace(&output, &osize,
8162 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 goto onError;
8164 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 break;
8169 default:
8170 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 reason, input, &exc,
8172 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008173 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 goto onError;
8175 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 repsize = PyUnicode_GET_LENGTH(repunicode);
8177 if (charmaptranslate_makespace(&output, &osize,
8178 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 Py_DECREF(repunicode);
8180 goto onError;
8181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 for (uni2 = 0; repsize-->0; ++uni2)
8183 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8184 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 }
8188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8190 if (!res)
8191 goto onError;
8192 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 Py_XDECREF(exc);
8194 Py_XDECREF(errorHandler);
8195 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 Py_XDECREF(exc);
8200 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 return NULL;
8202}
8203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204/* Deprecated. Use PyUnicode_Translate instead. */
8205PyObject *
8206PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8207 Py_ssize_t size,
8208 PyObject *mapping,
8209 const char *errors)
8210{
8211 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8212 if (!unicode)
8213 return NULL;
8214 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8215}
8216
Alexander Belopolsky40018472011-02-26 01:02:56 +00008217PyObject *
8218PyUnicode_Translate(PyObject *str,
8219 PyObject *mapping,
8220 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221{
8222 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008223
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 str = PyUnicode_FromObject(str);
8225 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 Py_DECREF(str);
8229 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008230
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 Py_XDECREF(str);
8233 return NULL;
8234}
Tim Petersced69f82003-09-16 20:30:58 +00008235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008237fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238{
8239 /* No need to call PyUnicode_READY(self) because this function is only
8240 called as a callback from fixup() which does it already. */
8241 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8242 const int kind = PyUnicode_KIND(self);
8243 void *data = PyUnicode_DATA(self);
8244 Py_UCS4 maxchar = 0, ch, fixed;
8245 Py_ssize_t i;
8246
8247 for (i = 0; i < len; ++i) {
8248 ch = PyUnicode_READ(kind, data, i);
8249 fixed = 0;
8250 if (ch > 127) {
8251 if (Py_UNICODE_ISSPACE(ch))
8252 fixed = ' ';
8253 else {
8254 const int decimal = Py_UNICODE_TODECIMAL(ch);
8255 if (decimal >= 0)
8256 fixed = '0' + decimal;
8257 }
8258 if (fixed != 0) {
8259 if (fixed > maxchar)
8260 maxchar = fixed;
8261 PyUnicode_WRITE(kind, data, i, fixed);
8262 }
8263 else if (ch > maxchar)
8264 maxchar = ch;
8265 }
8266 else if (ch > maxchar)
8267 maxchar = ch;
8268 }
8269
8270 return maxchar;
8271}
8272
8273PyObject *
8274_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8275{
8276 if (!PyUnicode_Check(unicode)) {
8277 PyErr_BadInternalCall();
8278 return NULL;
8279 }
8280 if (PyUnicode_READY(unicode) == -1)
8281 return NULL;
8282 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8283 /* If the string is already ASCII, just return the same string */
8284 Py_INCREF(unicode);
8285 return unicode;
8286 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008287 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288}
8289
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008290PyObject *
8291PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8292 Py_ssize_t length)
8293{
8294 PyObject *result;
8295 Py_UNICODE *p; /* write pointer into result */
8296 Py_ssize_t i;
8297 /* Copy to a new string */
8298 result = (PyObject *)_PyUnicode_New(length);
8299 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8300 if (result == NULL)
8301 return result;
8302 p = PyUnicode_AS_UNICODE(result);
8303 /* Iterate over code points */
8304 for (i = 0; i < length; i++) {
8305 Py_UNICODE ch =s[i];
8306 if (ch > 127) {
8307 int decimal = Py_UNICODE_TODECIMAL(ch);
8308 if (decimal >= 0)
8309 p[i] = '0' + decimal;
8310 }
8311 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008312#ifndef DONT_MAKE_RESULT_READY
8313 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 Py_DECREF(result);
8315 return NULL;
8316 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008317#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008318 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008319 return result;
8320}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008321/* --- Decimal Encoder ---------------------------------------------------- */
8322
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323int
8324PyUnicode_EncodeDecimal(Py_UNICODE *s,
8325 Py_ssize_t length,
8326 char *output,
8327 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008328{
8329 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 PyObject *errorHandler = NULL;
8331 PyObject *exc = NULL;
8332 const char *encoding = "decimal";
8333 const char *reason = "invalid decimal Unicode string";
8334 /* the following variable is used for caching string comparisons
8335 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8336 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008337
8338 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 PyErr_BadArgument();
8340 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008341 }
8342
8343 p = s;
8344 end = s + length;
8345 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 register Py_UNICODE ch = *p;
8347 int decimal;
8348 PyObject *repunicode;
8349 Py_ssize_t repsize;
8350 Py_ssize_t newpos;
8351 Py_UNICODE *uni2;
8352 Py_UNICODE *collstart;
8353 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008354
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008356 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 ++p;
8358 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 decimal = Py_UNICODE_TODECIMAL(ch);
8361 if (decimal >= 0) {
8362 *output++ = '0' + decimal;
8363 ++p;
8364 continue;
8365 }
8366 if (0 < ch && ch < 256) {
8367 *output++ = (char)ch;
8368 ++p;
8369 continue;
8370 }
8371 /* All other characters are considered unencodable */
8372 collstart = p;
8373 collend = p+1;
8374 while (collend < end) {
8375 if ((0 < *collend && *collend < 256) ||
8376 !Py_UNICODE_ISSPACE(*collend) ||
8377 Py_UNICODE_TODECIMAL(*collend))
8378 break;
8379 }
8380 /* cache callback name lookup
8381 * (if not done yet, i.e. it's the first error) */
8382 if (known_errorHandler==-1) {
8383 if ((errors==NULL) || (!strcmp(errors, "strict")))
8384 known_errorHandler = 1;
8385 else if (!strcmp(errors, "replace"))
8386 known_errorHandler = 2;
8387 else if (!strcmp(errors, "ignore"))
8388 known_errorHandler = 3;
8389 else if (!strcmp(errors, "xmlcharrefreplace"))
8390 known_errorHandler = 4;
8391 else
8392 known_errorHandler = 0;
8393 }
8394 switch (known_errorHandler) {
8395 case 1: /* strict */
8396 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8397 goto onError;
8398 case 2: /* replace */
8399 for (p = collstart; p < collend; ++p)
8400 *output++ = '?';
8401 /* fall through */
8402 case 3: /* ignore */
8403 p = collend;
8404 break;
8405 case 4: /* xmlcharrefreplace */
8406 /* generate replacement (temporarily (mis)uses p) */
8407 for (p = collstart; p < collend; ++p)
8408 output += sprintf(output, "&#%d;", (int)*p);
8409 p = collend;
8410 break;
8411 default:
8412 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8413 encoding, reason, s, length, &exc,
8414 collstart-s, collend-s, &newpos);
8415 if (repunicode == NULL)
8416 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008417 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008418 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008419 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8420 Py_DECREF(repunicode);
8421 goto onError;
8422 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 /* generate replacement */
8424 repsize = PyUnicode_GET_SIZE(repunicode);
8425 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8426 Py_UNICODE ch = *uni2;
8427 if (Py_UNICODE_ISSPACE(ch))
8428 *output++ = ' ';
8429 else {
8430 decimal = Py_UNICODE_TODECIMAL(ch);
8431 if (decimal >= 0)
8432 *output++ = '0' + decimal;
8433 else if (0 < ch && ch < 256)
8434 *output++ = (char)ch;
8435 else {
8436 Py_DECREF(repunicode);
8437 raise_encode_exception(&exc, encoding,
8438 s, length, collstart-s, collend-s, reason);
8439 goto onError;
8440 }
8441 }
8442 }
8443 p = s + newpos;
8444 Py_DECREF(repunicode);
8445 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008446 }
8447 /* 0-terminate the output string */
8448 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 Py_XDECREF(exc);
8450 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008451 return 0;
8452
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008454 Py_XDECREF(exc);
8455 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008456 return -1;
8457}
8458
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459/* --- Helpers ------------------------------------------------------------ */
8460
Victor Stinnerc3cec782011-10-05 21:24:08 +02008461#include "stringlib/asciilib.h"
8462#include "stringlib/fastsearch.h"
8463#include "stringlib/partition.h"
8464#include "stringlib/split.h"
8465#include "stringlib/count.h"
8466#include "stringlib/find.h"
8467#include "stringlib/localeutil.h"
8468#include "stringlib/undef.h"
8469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470#include "stringlib/ucs1lib.h"
8471#include "stringlib/fastsearch.h"
8472#include "stringlib/partition.h"
8473#include "stringlib/split.h"
8474#include "stringlib/count.h"
8475#include "stringlib/find.h"
8476#include "stringlib/localeutil.h"
8477#include "stringlib/undef.h"
8478
8479#include "stringlib/ucs2lib.h"
8480#include "stringlib/fastsearch.h"
8481#include "stringlib/partition.h"
8482#include "stringlib/split.h"
8483#include "stringlib/count.h"
8484#include "stringlib/find.h"
8485#include "stringlib/localeutil.h"
8486#include "stringlib/undef.h"
8487
8488#include "stringlib/ucs4lib.h"
8489#include "stringlib/fastsearch.h"
8490#include "stringlib/partition.h"
8491#include "stringlib/split.h"
8492#include "stringlib/count.h"
8493#include "stringlib/find.h"
8494#include "stringlib/localeutil.h"
8495#include "stringlib/undef.h"
8496
8497static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008498any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8499 const Py_UCS1*, Py_ssize_t,
8500 Py_ssize_t, Py_ssize_t),
8501 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 const Py_UCS1*, Py_ssize_t,
8503 Py_ssize_t, Py_ssize_t),
8504 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8505 const Py_UCS2*, Py_ssize_t,
8506 Py_ssize_t, Py_ssize_t),
8507 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8508 const Py_UCS4*, Py_ssize_t,
8509 Py_ssize_t, Py_ssize_t),
8510 PyObject* s1, PyObject* s2,
8511 Py_ssize_t start,
8512 Py_ssize_t end)
8513{
8514 int kind1, kind2, kind;
8515 void *buf1, *buf2;
8516 Py_ssize_t len1, len2, result;
8517
8518 kind1 = PyUnicode_KIND(s1);
8519 kind2 = PyUnicode_KIND(s2);
8520 kind = kind1 > kind2 ? kind1 : kind2;
8521 buf1 = PyUnicode_DATA(s1);
8522 buf2 = PyUnicode_DATA(s2);
8523 if (kind1 != kind)
8524 buf1 = _PyUnicode_AsKind(s1, kind);
8525 if (!buf1)
8526 return -2;
8527 if (kind2 != kind)
8528 buf2 = _PyUnicode_AsKind(s2, kind);
8529 if (!buf2) {
8530 if (kind1 != kind) PyMem_Free(buf1);
8531 return -2;
8532 }
8533 len1 = PyUnicode_GET_LENGTH(s1);
8534 len2 = PyUnicode_GET_LENGTH(s2);
8535
8536 switch(kind) {
8537 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008538 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8539 result = ascii(buf1, len1, buf2, len2, start, end);
8540 else
8541 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 break;
8543 case PyUnicode_2BYTE_KIND:
8544 result = ucs2(buf1, len1, buf2, len2, start, end);
8545 break;
8546 case PyUnicode_4BYTE_KIND:
8547 result = ucs4(buf1, len1, buf2, len2, start, end);
8548 break;
8549 default:
8550 assert(0); result = -2;
8551 }
8552
8553 if (kind1 != kind)
8554 PyMem_Free(buf1);
8555 if (kind2 != kind)
8556 PyMem_Free(buf2);
8557
8558 return result;
8559}
8560
8561Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008562_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 Py_ssize_t n_buffer,
8564 void *digits, Py_ssize_t n_digits,
8565 Py_ssize_t min_width,
8566 const char *grouping,
8567 const char *thousands_sep)
8568{
8569 switch(kind) {
8570 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008571 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8572 return _PyUnicode_ascii_InsertThousandsGrouping(
8573 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8574 min_width, grouping, thousands_sep);
8575 else
8576 return _PyUnicode_ucs1_InsertThousandsGrouping(
8577 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8578 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 case PyUnicode_2BYTE_KIND:
8580 return _PyUnicode_ucs2_InsertThousandsGrouping(
8581 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8582 min_width, grouping, thousands_sep);
8583 case PyUnicode_4BYTE_KIND:
8584 return _PyUnicode_ucs4_InsertThousandsGrouping(
8585 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8586 min_width, grouping, thousands_sep);
8587 }
8588 assert(0);
8589 return -1;
8590}
8591
8592
Eric Smith8c663262007-08-25 02:26:07 +00008593#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008594#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008595
Thomas Wouters477c8d52006-05-27 19:21:47 +00008596#include "stringlib/count.h"
8597#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008598
/* helper macro to fixup start/end slice values: `end` is clipped to
   `len`, negative `start`/`end` are taken relative to `len` and then
   clipped to 0.  `start` and `end` must be modifiable lvalues; all
   arguments are evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (safe in an unbraced if/else); the invocation needs a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008613
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614Py_ssize_t
8615PyUnicode_Count(PyObject *str,
8616 PyObject *substr,
8617 Py_ssize_t start,
8618 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008620 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008621 PyUnicodeObject* str_obj;
8622 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 int kind1, kind2, kind;
8624 void *buf1 = NULL, *buf2 = NULL;
8625 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008626
Thomas Wouters477c8d52006-05-27 19:21:47 +00008627 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008630 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008631 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 Py_DECREF(str_obj);
8633 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 }
Tim Petersced69f82003-09-16 20:30:58 +00008635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 kind1 = PyUnicode_KIND(str_obj);
8637 kind2 = PyUnicode_KIND(sub_obj);
8638 kind = kind1 > kind2 ? kind1 : kind2;
8639 buf1 = PyUnicode_DATA(str_obj);
8640 if (kind1 != kind)
8641 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8642 if (!buf1)
8643 goto onError;
8644 buf2 = PyUnicode_DATA(sub_obj);
8645 if (kind2 != kind)
8646 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8647 if (!buf2)
8648 goto onError;
8649 len1 = PyUnicode_GET_LENGTH(str_obj);
8650 len2 = PyUnicode_GET_LENGTH(sub_obj);
8651
8652 ADJUST_INDICES(start, end, len1);
8653 switch(kind) {
8654 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008655 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8656 result = asciilib_count(
8657 ((Py_UCS1*)buf1) + start, end - start,
8658 buf2, len2, PY_SSIZE_T_MAX
8659 );
8660 else
8661 result = ucs1lib_count(
8662 ((Py_UCS1*)buf1) + start, end - start,
8663 buf2, len2, PY_SSIZE_T_MAX
8664 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 break;
8666 case PyUnicode_2BYTE_KIND:
8667 result = ucs2lib_count(
8668 ((Py_UCS2*)buf1) + start, end - start,
8669 buf2, len2, PY_SSIZE_T_MAX
8670 );
8671 break;
8672 case PyUnicode_4BYTE_KIND:
8673 result = ucs4lib_count(
8674 ((Py_UCS4*)buf1) + start, end - start,
8675 buf2, len2, PY_SSIZE_T_MAX
8676 );
8677 break;
8678 default:
8679 assert(0); result = 0;
8680 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008681
8682 Py_DECREF(sub_obj);
8683 Py_DECREF(str_obj);
8684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 if (kind1 != kind)
8686 PyMem_Free(buf1);
8687 if (kind2 != kind)
8688 PyMem_Free(buf2);
8689
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 onError:
8692 Py_DECREF(sub_obj);
8693 Py_DECREF(str_obj);
8694 if (kind1 != kind && buf1)
8695 PyMem_Free(buf1);
8696 if (kind2 != kind && buf2)
8697 PyMem_Free(buf2);
8698 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699}
8700
Alexander Belopolsky40018472011-02-26 01:02:56 +00008701Py_ssize_t
8702PyUnicode_Find(PyObject *str,
8703 PyObject *sub,
8704 Py_ssize_t start,
8705 Py_ssize_t end,
8706 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008708 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008709
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008713 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 Py_DECREF(str);
8716 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 }
Tim Petersced69f82003-09-16 20:30:58 +00008718
Thomas Wouters477c8d52006-05-27 19:21:47 +00008719 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008721 asciilib_find_slice, ucs1lib_find_slice,
8722 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008724 );
8725 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008727 asciilib_find_slice, ucs1lib_rfind_slice,
8728 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008730 );
8731
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008733 Py_DECREF(sub);
8734
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 return result;
8736}
8737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738Py_ssize_t
8739PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8740 Py_ssize_t start, Py_ssize_t end,
8741 int direction)
8742{
8743 char *result;
8744 int kind;
8745 if (PyUnicode_READY(str) == -1)
8746 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008747 if (start < 0 || end < 0) {
8748 PyErr_SetString(PyExc_IndexError, "string index out of range");
8749 return -2;
8750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 if (end > PyUnicode_GET_LENGTH(str))
8752 end = PyUnicode_GET_LENGTH(str);
8753 kind = PyUnicode_KIND(str);
8754 result = findchar(PyUnicode_1BYTE_DATA(str)
8755 + PyUnicode_KIND_SIZE(kind, start),
8756 kind,
8757 end-start, ch, direction);
8758 if (!result)
8759 return -1;
8760 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8761}
8762
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
8764tailmatch(PyUnicodeObject *self,
8765 PyUnicodeObject *substring,
8766 Py_ssize_t start,
8767 Py_ssize_t end,
8768 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 int kind_self;
8771 int kind_sub;
8772 void *data_self;
8773 void *data_sub;
8774 Py_ssize_t offset;
8775 Py_ssize_t i;
8776 Py_ssize_t end_sub;
8777
8778 if (PyUnicode_READY(self) == -1 ||
8779 PyUnicode_READY(substring) == -1)
8780 return 0;
8781
8782 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 return 1;
8784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8786 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 kind_self = PyUnicode_KIND(self);
8791 data_self = PyUnicode_DATA(self);
8792 kind_sub = PyUnicode_KIND(substring);
8793 data_sub = PyUnicode_DATA(substring);
8794 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8795
8796 if (direction > 0)
8797 offset = end;
8798 else
8799 offset = start;
8800
8801 if (PyUnicode_READ(kind_self, data_self, offset) ==
8802 PyUnicode_READ(kind_sub, data_sub, 0) &&
8803 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8804 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8805 /* If both are of the same kind, memcmp is sufficient */
8806 if (kind_self == kind_sub) {
8807 return ! memcmp((char *)data_self +
8808 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8809 data_sub,
8810 PyUnicode_GET_LENGTH(substring) *
8811 PyUnicode_CHARACTER_SIZE(substring));
8812 }
8813 /* otherwise we have to compare each character by first accesing it */
8814 else {
8815 /* We do not need to compare 0 and len(substring)-1 because
8816 the if statement above ensured already that they are equal
8817 when we end up here. */
8818 // TODO: honor direction and do a forward or backwards search
8819 for (i = 1; i < end_sub; ++i) {
8820 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8821 PyUnicode_READ(kind_sub, data_sub, i))
8822 return 0;
8823 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 }
8827
8828 return 0;
8829}
8830
Alexander Belopolsky40018472011-02-26 01:02:56 +00008831Py_ssize_t
8832PyUnicode_Tailmatch(PyObject *str,
8833 PyObject *substr,
8834 Py_ssize_t start,
8835 Py_ssize_t end,
8836 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008838 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008839
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 str = PyUnicode_FromObject(str);
8841 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 substr = PyUnicode_FromObject(substr);
8844 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 Py_DECREF(str);
8846 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 }
Tim Petersced69f82003-09-16 20:30:58 +00008848
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 (PyUnicodeObject *)substr,
8851 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 Py_DECREF(str);
8853 Py_DECREF(substr);
8854 return result;
8855}
8856
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857/* Apply fixfct filter to the Unicode object self and return a
8858 reference to the modified object */
8859
Alexander Belopolsky40018472011-02-26 01:02:56 +00008860static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008861fixup(PyObject *self,
8862 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 PyObject *u;
8865 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 if (PyUnicode_READY(self) == -1)
8868 return NULL;
8869 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8870 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8871 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8876 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 /* fix functions return the new maximum character in a string,
8879 if the kind of the resulting unicode object does not change,
8880 everything is fine. Otherwise we need to change the string kind
8881 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008882 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 if (maxchar_new == 0)
8884 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8885 else if (maxchar_new <= 127)
8886 maxchar_new = 127;
8887 else if (maxchar_new <= 255)
8888 maxchar_new = 255;
8889 else if (maxchar_new <= 65535)
8890 maxchar_new = 65535;
8891 else
8892 maxchar_new = 1114111; /* 0x10ffff */
8893
8894 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 /* fixfct should return TRUE if it modified the buffer. If
8896 FALSE, return a reference to the original buffer instead
8897 (to save space, not time) */
8898 Py_INCREF(self);
8899 Py_DECREF(u);
8900 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 else if (maxchar_new == maxchar_old) {
8903 return u;
8904 }
8905 else {
8906 /* In case the maximum character changed, we need to
8907 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008908 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 if (v == NULL) {
8910 Py_DECREF(u);
8911 return NULL;
8912 }
8913 if (maxchar_new > maxchar_old) {
8914 /* If the maxchar increased so that the kind changed, not all
8915 characters are representable anymore and we need to fix the
8916 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008917 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008918 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8920 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008921 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008922 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924
8925 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008926 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 return v;
8928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929}
8930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008932fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 /* No need to call PyUnicode_READY(self) because this function is only
8935 called as a callback from fixup() which does it already. */
8936 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8937 const int kind = PyUnicode_KIND(self);
8938 void *data = PyUnicode_DATA(self);
8939 int touched = 0;
8940 Py_UCS4 maxchar = 0;
8941 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 for (i = 0; i < len; ++i) {
8944 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8945 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8946 if (up != ch) {
8947 if (up > maxchar)
8948 maxchar = up;
8949 PyUnicode_WRITE(kind, data, i, up);
8950 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 else if (ch > maxchar)
8953 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 }
8955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 if (touched)
8957 return maxchar;
8958 else
8959 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008963fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8966 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8967 const int kind = PyUnicode_KIND(self);
8968 void *data = PyUnicode_DATA(self);
8969 int touched = 0;
8970 Py_UCS4 maxchar = 0;
8971 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 for(i = 0; i < len; ++i) {
8974 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8975 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8976 if (lo != ch) {
8977 if (lo > maxchar)
8978 maxchar = lo;
8979 PyUnicode_WRITE(kind, data, i, lo);
8980 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 else if (ch > maxchar)
8983 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 }
8985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 if (touched)
8987 return maxchar;
8988 else
8989 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990}
8991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008993fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8996 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8997 const int kind = PyUnicode_KIND(self);
8998 void *data = PyUnicode_DATA(self);
8999 int touched = 0;
9000 Py_UCS4 maxchar = 0;
9001 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 for(i = 0; i < len; ++i) {
9004 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9005 Py_UCS4 nu = 0;
9006
9007 if (Py_UNICODE_ISUPPER(ch))
9008 nu = Py_UNICODE_TOLOWER(ch);
9009 else if (Py_UNICODE_ISLOWER(ch))
9010 nu = Py_UNICODE_TOUPPER(ch);
9011
9012 if (nu != 0) {
9013 if (nu > maxchar)
9014 maxchar = nu;
9015 PyUnicode_WRITE(kind, data, i, nu);
9016 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 else if (ch > maxchar)
9019 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 }
9021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 if (touched)
9023 return maxchar;
9024 else
9025 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026}
9027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009029fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9032 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9033 const int kind = PyUnicode_KIND(self);
9034 void *data = PyUnicode_DATA(self);
9035 int touched = 0;
9036 Py_UCS4 maxchar = 0;
9037 Py_ssize_t i = 0;
9038 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009039
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009040 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042
9043 ch = PyUnicode_READ(kind, data, i);
9044 if (!Py_UNICODE_ISUPPER(ch)) {
9045 maxchar = Py_UNICODE_TOUPPER(ch);
9046 PyUnicode_WRITE(kind, data, i, maxchar);
9047 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 ++i;
9050 for(; i < len; ++i) {
9051 ch = PyUnicode_READ(kind, data, i);
9052 if (!Py_UNICODE_ISLOWER(ch)) {
9053 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9054 if (lo > maxchar)
9055 maxchar = lo;
9056 PyUnicode_WRITE(kind, data, i, lo);
9057 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 else if (ch > maxchar)
9060 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062
9063 if (touched)
9064 return maxchar;
9065 else
9066 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009070fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9073 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9074 const int kind = PyUnicode_KIND(self);
9075 void *data = PyUnicode_DATA(self);
9076 Py_UCS4 maxchar = 0;
9077 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078 int previous_is_cased;
9079
9080 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 if (len == 1) {
9082 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9083 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9084 if (ti != ch) {
9085 PyUnicode_WRITE(kind, data, i, ti);
9086 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 }
9088 else
9089 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 for(; i < len; ++i) {
9093 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9094 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009095
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 nu = Py_UNICODE_TOTITLE(ch);
9100
9101 if (nu > maxchar)
9102 maxchar = nu;
9103 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009104
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 if (Py_UNICODE_ISLOWER(ch) ||
9106 Py_UNICODE_ISUPPER(ch) ||
9107 Py_UNICODE_ISTITLE(ch))
9108 previous_is_cased = 1;
9109 else
9110 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113}
9114
Tim Peters8ce9f162004-08-27 01:49:32 +00009115PyObject *
9116PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009119 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009121 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009122 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9123 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009124 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009126 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128
Tim Peters05eba1f2004-08-27 21:32:02 +00009129 fseq = PySequence_Fast(seq, "");
9130 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009131 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009132 }
9133
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009134 /* NOTE: the following code can't call back into Python code,
9135 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009136 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009137
Tim Peters05eba1f2004-08-27 21:32:02 +00009138 seqlen = PySequence_Fast_GET_SIZE(fseq);
9139 /* If empty sequence, return u"". */
9140 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009141 Py_DECREF(fseq);
9142 Py_INCREF(unicode_empty);
9143 res = unicode_empty;
9144 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009145 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009146
Tim Peters05eba1f2004-08-27 21:32:02 +00009147 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009148 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009149 if (seqlen == 1) {
9150 if (PyUnicode_CheckExact(items[0])) {
9151 res = items[0];
9152 Py_INCREF(res);
9153 Py_DECREF(fseq);
9154 return res;
9155 }
9156 sep = NULL;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009157 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009158 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009159 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009160 /* Set up sep and seplen */
9161 if (separator == NULL) {
9162 /* fall back to a blank space separator */
9163 sep = PyUnicode_FromOrdinal(' ');
9164 if (!sep)
9165 goto onError;
9166 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009167 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009168 else {
9169 if (!PyUnicode_Check(separator)) {
9170 PyErr_Format(PyExc_TypeError,
9171 "separator: expected str instance,"
9172 " %.80s found",
9173 Py_TYPE(separator)->tp_name);
9174 goto onError;
9175 }
9176 if (PyUnicode_READY(separator))
9177 goto onError;
9178 sep = separator;
9179 seplen = PyUnicode_GET_LENGTH(separator);
9180 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9181 /* inc refcount to keep this code path symmetric with the
9182 above case of a blank separator */
9183 Py_INCREF(sep);
9184 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009185 }
9186
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009187 /* There are at least two things to join, or else we have a subclass
9188 * of str in the sequence.
9189 * Do a pre-pass to figure out the total amount of space we'll
9190 * need (sz), and see whether all argument are strings.
9191 */
9192 sz = 0;
9193 for (i = 0; i < seqlen; i++) {
9194 const Py_ssize_t old_sz = sz;
9195 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 if (!PyUnicode_Check(item)) {
9197 PyErr_Format(PyExc_TypeError,
9198 "sequence item %zd: expected str instance,"
9199 " %.80s found",
9200 i, Py_TYPE(item)->tp_name);
9201 goto onError;
9202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 if (PyUnicode_READY(item) == -1)
9204 goto onError;
9205 sz += PyUnicode_GET_LENGTH(item);
9206 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009207 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009208 if (i != 0)
9209 sz += seplen;
9210 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9211 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009213 goto onError;
9214 }
9215 }
Tim Petersced69f82003-09-16 20:30:58 +00009216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009218 if (res == NULL)
9219 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009220
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009221 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009223 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009224 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009226 if (i && seplen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009227 copy_characters(res, res_offset, sep, 0, seplen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009230 itemlen = PyUnicode_GET_LENGTH(item);
9231 if (itemlen != 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009232 copy_characters(res, res_offset, item, 0, itemlen);
Victor Stinner9ce5a832011-10-03 23:36:02 +02009233 res_offset += itemlen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009234 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009237
Tim Peters05eba1f2004-08-27 21:32:02 +00009238 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009240 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009244 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009246 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 return NULL;
9248}
9249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250#define FILL(kind, data, value, start, length) \
9251 do { \
9252 Py_ssize_t i_ = 0; \
9253 assert(kind != PyUnicode_WCHAR_KIND); \
9254 switch ((kind)) { \
9255 case PyUnicode_1BYTE_KIND: { \
9256 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9257 memset(to_, (unsigned char)value, length); \
9258 break; \
9259 } \
9260 case PyUnicode_2BYTE_KIND: { \
9261 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9262 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9263 break; \
9264 } \
9265 default: { \
9266 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9267 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9268 break; \
9269 } \
9270 } \
9271 } while (0)
9272
Victor Stinner9310abb2011-10-05 00:59:23 +02009273static PyObject *
9274pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009275 Py_ssize_t left,
9276 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 PyObject *u;
9280 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009281 int kind;
9282 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283
9284 if (left < 0)
9285 left = 0;
9286 if (right < 0)
9287 right = 0;
9288
Tim Peters7a29bd52001-09-12 03:03:31 +00009289 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 Py_INCREF(self);
9291 return self;
9292 }
9293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9295 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009296 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9297 return NULL;
9298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9300 if (fill > maxchar)
9301 maxchar = fill;
9302 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009303 if (!u)
9304 return NULL;
9305
9306 kind = PyUnicode_KIND(u);
9307 data = PyUnicode_DATA(u);
9308 if (left)
9309 FILL(kind, data, fill, 0, left);
9310 if (right)
9311 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009312 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009313 assert(_PyUnicode_CheckConsistency(u, 1));
9314 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317
Alexander Belopolsky40018472011-02-26 01:02:56 +00009318PyObject *
9319PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322
9323 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 switch(PyUnicode_KIND(string)) {
9328 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009329 if (PyUnicode_IS_ASCII(string))
9330 list = asciilib_splitlines(
9331 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9332 PyUnicode_GET_LENGTH(string), keepends);
9333 else
9334 list = ucs1lib_splitlines(
9335 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9336 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 break;
9338 case PyUnicode_2BYTE_KIND:
9339 list = ucs2lib_splitlines(
9340 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9341 PyUnicode_GET_LENGTH(string), keepends);
9342 break;
9343 case PyUnicode_4BYTE_KIND:
9344 list = ucs4lib_splitlines(
9345 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9346 PyUnicode_GET_LENGTH(string), keepends);
9347 break;
9348 default:
9349 assert(0);
9350 list = 0;
9351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 Py_DECREF(string);
9353 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354}
9355
Alexander Belopolsky40018472011-02-26 01:02:56 +00009356static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009357split(PyObject *self,
9358 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 int kind1, kind2, kind;
9362 void *buf1, *buf2;
9363 Py_ssize_t len1, len2;
9364 PyObject* out;
9365
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009367 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 if (PyUnicode_READY(self) == -1)
9370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (substring == NULL)
9373 switch(PyUnicode_KIND(self)) {
9374 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009375 if (PyUnicode_IS_ASCII(self))
9376 return asciilib_split_whitespace(
9377 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9378 PyUnicode_GET_LENGTH(self), maxcount
9379 );
9380 else
9381 return ucs1lib_split_whitespace(
9382 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9383 PyUnicode_GET_LENGTH(self), maxcount
9384 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 case PyUnicode_2BYTE_KIND:
9386 return ucs2lib_split_whitespace(
9387 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9388 PyUnicode_GET_LENGTH(self), maxcount
9389 );
9390 case PyUnicode_4BYTE_KIND:
9391 return ucs4lib_split_whitespace(
9392 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9393 PyUnicode_GET_LENGTH(self), maxcount
9394 );
9395 default:
9396 assert(0);
9397 return NULL;
9398 }
9399
9400 if (PyUnicode_READY(substring) == -1)
9401 return NULL;
9402
9403 kind1 = PyUnicode_KIND(self);
9404 kind2 = PyUnicode_KIND(substring);
9405 kind = kind1 > kind2 ? kind1 : kind2;
9406 buf1 = PyUnicode_DATA(self);
9407 buf2 = PyUnicode_DATA(substring);
9408 if (kind1 != kind)
9409 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9410 if (!buf1)
9411 return NULL;
9412 if (kind2 != kind)
9413 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9414 if (!buf2) {
9415 if (kind1 != kind) PyMem_Free(buf1);
9416 return NULL;
9417 }
9418 len1 = PyUnicode_GET_LENGTH(self);
9419 len2 = PyUnicode_GET_LENGTH(substring);
9420
9421 switch(kind) {
9422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009423 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9424 out = asciilib_split(
9425 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9426 else
9427 out = ucs1lib_split(
9428 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 break;
9430 case PyUnicode_2BYTE_KIND:
9431 out = ucs2lib_split(
9432 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9433 break;
9434 case PyUnicode_4BYTE_KIND:
9435 out = ucs4lib_split(
9436 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9437 break;
9438 default:
9439 out = NULL;
9440 }
9441 if (kind1 != kind)
9442 PyMem_Free(buf1);
9443 if (kind2 != kind)
9444 PyMem_Free(buf2);
9445 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446}
9447
Alexander Belopolsky40018472011-02-26 01:02:56 +00009448static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009449rsplit(PyObject *self,
9450 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009451 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 int kind1, kind2, kind;
9454 void *buf1, *buf2;
9455 Py_ssize_t len1, len2;
9456 PyObject* out;
9457
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009458 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009459 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 if (PyUnicode_READY(self) == -1)
9462 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 if (substring == NULL)
9465 switch(PyUnicode_KIND(self)) {
9466 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009467 if (PyUnicode_IS_ASCII(self))
9468 return asciilib_rsplit_whitespace(
9469 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9470 PyUnicode_GET_LENGTH(self), maxcount
9471 );
9472 else
9473 return ucs1lib_rsplit_whitespace(
9474 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9475 PyUnicode_GET_LENGTH(self), maxcount
9476 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 case PyUnicode_2BYTE_KIND:
9478 return ucs2lib_rsplit_whitespace(
9479 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9480 PyUnicode_GET_LENGTH(self), maxcount
9481 );
9482 case PyUnicode_4BYTE_KIND:
9483 return ucs4lib_rsplit_whitespace(
9484 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9485 PyUnicode_GET_LENGTH(self), maxcount
9486 );
9487 default:
9488 assert(0);
9489 return NULL;
9490 }
9491
9492 if (PyUnicode_READY(substring) == -1)
9493 return NULL;
9494
9495 kind1 = PyUnicode_KIND(self);
9496 kind2 = PyUnicode_KIND(substring);
9497 kind = kind1 > kind2 ? kind1 : kind2;
9498 buf1 = PyUnicode_DATA(self);
9499 buf2 = PyUnicode_DATA(substring);
9500 if (kind1 != kind)
9501 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9502 if (!buf1)
9503 return NULL;
9504 if (kind2 != kind)
9505 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9506 if (!buf2) {
9507 if (kind1 != kind) PyMem_Free(buf1);
9508 return NULL;
9509 }
9510 len1 = PyUnicode_GET_LENGTH(self);
9511 len2 = PyUnicode_GET_LENGTH(substring);
9512
9513 switch(kind) {
9514 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009515 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9516 out = asciilib_rsplit(
9517 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9518 else
9519 out = ucs1lib_rsplit(
9520 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 break;
9522 case PyUnicode_2BYTE_KIND:
9523 out = ucs2lib_rsplit(
9524 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9525 break;
9526 case PyUnicode_4BYTE_KIND:
9527 out = ucs4lib_rsplit(
9528 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9529 break;
9530 default:
9531 out = NULL;
9532 }
9533 if (kind1 != kind)
9534 PyMem_Free(buf1);
9535 if (kind2 != kind)
9536 PyMem_Free(buf2);
9537 return out;
9538}
9539
9540static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009541anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9542 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543{
9544 switch(kind) {
9545 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009546 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9547 return asciilib_find(buf1, len1, buf2, len2, offset);
9548 else
9549 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 case PyUnicode_2BYTE_KIND:
9551 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9552 case PyUnicode_4BYTE_KIND:
9553 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9554 }
9555 assert(0);
9556 return -1;
9557}
9558
9559static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009560anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9561 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562{
9563 switch(kind) {
9564 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009565 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9566 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9567 else
9568 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 case PyUnicode_2BYTE_KIND:
9570 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9571 case PyUnicode_4BYTE_KIND:
9572 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9573 }
9574 assert(0);
9575 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579replace(PyObject *self, PyObject *str1,
9580 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 PyObject *u;
9583 char *sbuf = PyUnicode_DATA(self);
9584 char *buf1 = PyUnicode_DATA(str1);
9585 char *buf2 = PyUnicode_DATA(str2);
9586 int srelease = 0, release1 = 0, release2 = 0;
9587 int skind = PyUnicode_KIND(self);
9588 int kind1 = PyUnicode_KIND(str1);
9589 int kind2 = PyUnicode_KIND(str2);
9590 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9591 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9592 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593
9594 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009597 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 if (skind < kind1)
9600 /* substring too wide to be present */
9601 goto nothing;
9602
9603 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009604 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009605 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009607 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009609 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 Py_UCS4 u1, u2, maxchar;
9611 int mayshrink, rkind;
9612 u1 = PyUnicode_READ_CHAR(str1, 0);
9613 if (!findchar(sbuf, PyUnicode_KIND(self),
9614 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009615 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 u2 = PyUnicode_READ_CHAR(str2, 0);
9617 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9618 /* Replacing u1 with u2 may cause a maxchar reduction in the
9619 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 if (u2 > maxchar) {
9621 maxchar = u2;
9622 mayshrink = 0;
9623 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009624 else
9625 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009627 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009629 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 rkind = PyUnicode_KIND(u);
9631 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9632 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009633 if (--maxcount < 0)
9634 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009638 unicode_adjust_maxchar(&u);
9639 if (u == NULL)
9640 goto error;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 int rkind = skind;
9644 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009645 PyObject *rstr;
9646 Py_UCS4 maxchar;
9647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 if (kind1 < rkind) {
9649 /* widen substring */
9650 buf1 = _PyUnicode_AsKind(str1, rkind);
9651 if (!buf1) goto error;
9652 release1 = 1;
9653 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009654 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009655 if (i < 0)
9656 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 if (rkind > kind2) {
9658 /* widen replacement */
9659 buf2 = _PyUnicode_AsKind(str2, rkind);
9660 if (!buf2) goto error;
9661 release2 = 1;
9662 }
9663 else if (rkind < kind2) {
9664 /* widen self and buf1 */
9665 rkind = kind2;
9666 if (release1) PyMem_Free(buf1);
9667 sbuf = _PyUnicode_AsKind(self, rkind);
9668 if (!sbuf) goto error;
9669 srelease = 1;
9670 buf1 = _PyUnicode_AsKind(str1, rkind);
9671 if (!buf1) goto error;
9672 release1 = 1;
9673 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009674 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9675 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9676 rstr = PyUnicode_New(slen, maxchar);
9677 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009679 res = PyUnicode_DATA(rstr);
9680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009682 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9684 buf2,
9685 PyUnicode_KIND_SIZE(rkind, len2));
9686 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009687
9688 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009689 i = anylib_find(rkind, self,
9690 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9691 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009692 if (i == -1)
9693 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9695 buf2,
9696 PyUnicode_KIND_SIZE(rkind, len2));
9697 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699
Victor Stinner25a4b292011-10-06 12:31:55 +02009700 u = rstr;
9701 unicode_adjust_maxchar(&u);
9702 if (!u)
9703 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 Py_ssize_t n, i, j, ires;
9708 Py_ssize_t product, new_size;
9709 int rkind = skind;
Victor Stinner25a4b292011-10-06 12:31:55 +02009710 PyObject *rstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009712 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 if (kind1 < rkind) {
9715 buf1 = _PyUnicode_AsKind(str1, rkind);
9716 if (!buf1) goto error;
9717 release1 = 1;
9718 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009719 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009720 if (n == 0)
9721 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 if (kind2 < rkind) {
9723 buf2 = _PyUnicode_AsKind(str2, rkind);
9724 if (!buf2) goto error;
9725 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 else if (kind2 > rkind) {
9728 rkind = kind2;
9729 sbuf = _PyUnicode_AsKind(self, rkind);
9730 if (!sbuf) goto error;
9731 srelease = 1;
9732 if (release1) PyMem_Free(buf1);
9733 buf1 = _PyUnicode_AsKind(str1, rkind);
9734 if (!buf1) goto error;
9735 release1 = 1;
9736 }
9737 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9738 PyUnicode_GET_LENGTH(str1))); */
9739 product = n * (len2-len1);
9740 if ((product / (len2-len1)) != n) {
9741 PyErr_SetString(PyExc_OverflowError,
9742 "replace string is too long");
9743 goto error;
9744 }
9745 new_size = slen + product;
9746 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9747 PyErr_SetString(PyExc_OverflowError,
9748 "replace string is too long");
9749 goto error;
9750 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009751 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9752 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
9753 rstr = PyUnicode_New(new_size, maxchar);
9754 if (!rstr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 goto error;
Victor Stinner25a4b292011-10-06 12:31:55 +02009756 res = PyUnicode_DATA(rstr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 ires = i = 0;
9758 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009759 while (n-- > 0) {
9760 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009761 j = anylib_find(rkind, self,
9762 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9763 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009764 if (j == -1)
9765 break;
9766 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009767 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9769 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9770 PyUnicode_KIND_SIZE(rkind, j-i));
9771 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009772 }
9773 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 if (len2 > 0) {
9775 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9776 buf2,
9777 PyUnicode_KIND_SIZE(rkind, len2));
9778 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009783 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9785 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9786 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009787 } else {
9788 /* interleave */
9789 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9791 buf2,
9792 PyUnicode_KIND_SIZE(rkind, len2));
9793 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009794 if (--n <= 0)
9795 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9797 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9798 PyUnicode_KIND_SIZE(rkind, 1));
9799 ires++;
9800 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9803 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9804 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009805 }
Victor Stinner25a4b292011-10-06 12:31:55 +02009806 u = rstr;
9807 unicode_adjust_maxchar(&u);
9808 if (u == NULL)
9809 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 if (srelease)
9812 PyMem_FREE(sbuf);
9813 if (release1)
9814 PyMem_FREE(buf1);
9815 if (release2)
9816 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009817 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009819
Benjamin Peterson29060642009-01-31 22:14:21 +00009820 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009821 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (srelease)
9823 PyMem_FREE(sbuf);
9824 if (release1)
9825 PyMem_FREE(buf1);
9826 if (release2)
9827 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009828 if (PyUnicode_CheckExact(self)) {
9829 Py_INCREF(self);
9830 return (PyObject *) self;
9831 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009832 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 error:
9834 if (srelease && sbuf)
9835 PyMem_FREE(sbuf);
9836 if (release1 && buf1)
9837 PyMem_FREE(buf1);
9838 if (release2 && buf2)
9839 PyMem_FREE(buf2);
9840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841}
9842
9843/* --- Unicode Object Methods --------------------------------------------- */
9844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009845PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009846 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847\n\
9848Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009849characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
9851static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009852unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 return fixup(self, fixtitle);
9855}
9856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009857PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859\n\
9860Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009861have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862
9863static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009864unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 return fixup(self, fixcapitalize);
9867}
9868
#if 0
/* Disabled: this whole region is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    /* result is NULL here if capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9906
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009907/* Argument converter. Coerces to a single unicode character */
9908
9909static int
9910convert_uc(PyObject *obj, void *addr)
9911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009913 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009914
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915 uniobj = PyUnicode_FromObject(obj);
9916 if (uniobj == NULL) {
9917 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 return 0;
9920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 Py_DECREF(uniobj);
9925 return 0;
9926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 Py_DECREF(uniobj);
9929 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009930}
9931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009932PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009933 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009935Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009936done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937
9938static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009939unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009941 Py_ssize_t marg, left;
9942 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 Py_UCS4 fillchar = ' ';
9944
Victor Stinnere9a29352011-10-01 02:14:59 +02009945 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
Victor Stinnere9a29352011-10-01 02:14:59 +02009948 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 return NULL;
9950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 Py_INCREF(self);
9953 return (PyObject*) self;
9954 }
9955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 left = marg / 2 + (marg & width & 1);
9958
Victor Stinner9310abb2011-10-05 00:59:23 +02009959 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960}
9961
Marc-André Lemburge5034372000-08-08 08:04:29 +00009962#if 0
9963
9964/* This code should go into some future Unicode collation support
9965 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009966 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009967
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009968/* speedy UTF-16 code point order comparison */
9969/* gleaned from: */
9970/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9971
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009972static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009973{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009974 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009975 0, 0, 0, 0, 0, 0, 0, 0,
9976 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009977 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009978};
9979
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980static int
9981unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9982{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009983 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009984
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 Py_UNICODE *s1 = str1->str;
9986 Py_UNICODE *s2 = str2->str;
9987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 len1 = str1->_base._base.length;
9989 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009990
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009992 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009993
9994 c1 = *s1++;
9995 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009996
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 if (c1 > (1<<11) * 26)
9998 c1 += utf16Fixup[c1>>11];
9999 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010000 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010001 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +000010002
10003 if (c1 != c2)
10004 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +000010005
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +000010006 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007 }
10008
10009 return (len1 < len2) ? -1 : (len1 != len2);
10010}
10011
Marc-André Lemburge5034372000-08-08 08:04:29 +000010012#else
10013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014/* This function assumes that str1 and str2 are readied by the caller. */
10015
Marc-André Lemburge5034372000-08-08 08:04:29 +000010016static int
10017unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 int kind1, kind2;
10020 void *data1, *data2;
10021 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 kind1 = PyUnicode_KIND(str1);
10024 kind2 = PyUnicode_KIND(str2);
10025 data1 = PyUnicode_DATA(str1);
10026 data2 = PyUnicode_DATA(str2);
10027 len1 = PyUnicode_GET_LENGTH(str1);
10028 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 for (i = 0; i < len1 && i < len2; ++i) {
10031 Py_UCS4 c1, c2;
10032 c1 = PyUnicode_READ(kind1, data1, i);
10033 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010034
10035 if (c1 != c2)
10036 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010037 }
10038
10039 return (len1 < len2) ? -1 : (len1 != len2);
10040}
10041
10042#endif
10043
Alexander Belopolsky40018472011-02-26 01:02:56 +000010044int
10045PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10048 if (PyUnicode_READY(left) == -1 ||
10049 PyUnicode_READY(right) == -1)
10050 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010051 return unicode_compare((PyUnicodeObject *)left,
10052 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010054 PyErr_Format(PyExc_TypeError,
10055 "Can't compare %.100s and %.100s",
10056 left->ob_type->tp_name,
10057 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058 return -1;
10059}
10060
Martin v. Löwis5b222132007-06-10 09:51:05 +000010061int
10062PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 Py_ssize_t i;
10065 int kind;
10066 void *data;
10067 Py_UCS4 chr;
10068
Victor Stinner910337b2011-10-03 03:20:16 +020010069 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (PyUnicode_READY(uni) == -1)
10071 return -1;
10072 kind = PyUnicode_KIND(uni);
10073 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010074 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10076 if (chr != str[i])
10077 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010078 /* This check keeps Python strings that end in '\0' from comparing equal
10079 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010082 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010084 return 0;
10085}
10086
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010087
Benjamin Peterson29060642009-01-31 22:14:21 +000010088#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010089 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010090
Alexander Belopolsky40018472011-02-26 01:02:56 +000010091PyObject *
10092PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010093{
10094 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010095
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010096 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10097 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 if (PyUnicode_READY(left) == -1 ||
10099 PyUnicode_READY(right) == -1)
10100 return NULL;
10101 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10102 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010103 if (op == Py_EQ) {
10104 Py_INCREF(Py_False);
10105 return Py_False;
10106 }
10107 if (op == Py_NE) {
10108 Py_INCREF(Py_True);
10109 return Py_True;
10110 }
10111 }
10112 if (left == right)
10113 result = 0;
10114 else
10115 result = unicode_compare((PyUnicodeObject *)left,
10116 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010117
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010118 /* Convert the return value to a Boolean */
10119 switch (op) {
10120 case Py_EQ:
10121 v = TEST_COND(result == 0);
10122 break;
10123 case Py_NE:
10124 v = TEST_COND(result != 0);
10125 break;
10126 case Py_LE:
10127 v = TEST_COND(result <= 0);
10128 break;
10129 case Py_GE:
10130 v = TEST_COND(result >= 0);
10131 break;
10132 case Py_LT:
10133 v = TEST_COND(result == -1);
10134 break;
10135 case Py_GT:
10136 v = TEST_COND(result == 1);
10137 break;
10138 default:
10139 PyErr_BadArgument();
10140 return NULL;
10141 }
10142 Py_INCREF(v);
10143 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010144 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010145
Brian Curtindfc80e32011-08-10 20:28:54 -050010146 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010147}
10148
Alexander Belopolsky40018472011-02-26 01:02:56 +000010149int
10150PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010151{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010152 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 int kind1, kind2, kind;
10154 void *buf1, *buf2;
10155 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010156 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010157
10158 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 sub = PyUnicode_FromObject(element);
10160 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010161 PyErr_Format(PyExc_TypeError,
10162 "'in <string>' requires string as left operand, not %s",
10163 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010164 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (PyUnicode_READY(sub) == -1)
10167 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010168
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010170 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 Py_DECREF(sub);
10172 return -1;
10173 }
10174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 kind1 = PyUnicode_KIND(str);
10176 kind2 = PyUnicode_KIND(sub);
10177 kind = kind1 > kind2 ? kind1 : kind2;
10178 buf1 = PyUnicode_DATA(str);
10179 buf2 = PyUnicode_DATA(sub);
10180 if (kind1 != kind)
10181 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10182 if (!buf1) {
10183 Py_DECREF(sub);
10184 return -1;
10185 }
10186 if (kind2 != kind)
10187 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10188 if (!buf2) {
10189 Py_DECREF(sub);
10190 if (kind1 != kind) PyMem_Free(buf1);
10191 return -1;
10192 }
10193 len1 = PyUnicode_GET_LENGTH(str);
10194 len2 = PyUnicode_GET_LENGTH(sub);
10195
10196 switch(kind) {
10197 case PyUnicode_1BYTE_KIND:
10198 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10199 break;
10200 case PyUnicode_2BYTE_KIND:
10201 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10202 break;
10203 case PyUnicode_4BYTE_KIND:
10204 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10205 break;
10206 default:
10207 result = -1;
10208 assert(0);
10209 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210
10211 Py_DECREF(str);
10212 Py_DECREF(sub);
10213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (kind1 != kind)
10215 PyMem_Free(buf1);
10216 if (kind2 != kind)
10217 PyMem_Free(buf2);
10218
Guido van Rossum403d68b2000-03-13 15:55:09 +000010219 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010220}
10221
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222/* Concat to string or Unicode object giving a new Unicode object. */
10223
Alexander Belopolsky40018472011-02-26 01:02:56 +000010224PyObject *
10225PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyObject *u = NULL, *v = NULL, *w;
10228 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
10230 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010236 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237
10238 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010239 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010240 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010243 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010244 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 }
10247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010249 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 w = PyUnicode_New(
10253 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10254 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010257 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10258 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 Py_DECREF(u);
10260 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010261 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265 Py_XDECREF(u);
10266 Py_XDECREF(v);
10267 return NULL;
10268}
10269
Victor Stinnerb0923652011-10-04 01:17:31 +020010270static void
10271unicode_append_inplace(PyObject **p_left, PyObject *right)
10272{
10273 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010274
10275 assert(PyUnicode_IS_READY(*p_left));
10276 assert(PyUnicode_IS_READY(right));
10277
10278 left_len = PyUnicode_GET_LENGTH(*p_left);
10279 right_len = PyUnicode_GET_LENGTH(right);
10280 if (left_len > PY_SSIZE_T_MAX - right_len) {
10281 PyErr_SetString(PyExc_OverflowError,
10282 "strings are too large to concat");
10283 goto error;
10284 }
10285 new_len = left_len + right_len;
10286
10287 /* Now we own the last reference to 'left', so we can resize it
10288 * in-place.
10289 */
10290 if (unicode_resize(p_left, new_len) != 0) {
10291 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10292 * deallocated so it cannot be put back into
10293 * 'variable'. The MemoryError is raised when there
10294 * is no value in 'variable', which might (very
10295 * remotely) be a cause of incompatibilities.
10296 */
10297 goto error;
10298 }
10299 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010300 copy_characters(*p_left, left_len, right, 0, right_len);
10301 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010302 return;
10303
10304error:
10305 Py_DECREF(*p_left);
10306 *p_left = NULL;
10307}
10308
Walter Dörwald1ab83302007-05-18 17:15:44 +000010309void
Victor Stinner23e56682011-10-03 03:54:37 +020010310PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010311{
Victor Stinner23e56682011-10-03 03:54:37 +020010312 PyObject *left, *res;
10313
10314 if (p_left == NULL) {
10315 if (!PyErr_Occurred())
10316 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 return;
10318 }
Victor Stinner23e56682011-10-03 03:54:37 +020010319 left = *p_left;
10320 if (right == NULL || !PyUnicode_Check(left)) {
10321 if (!PyErr_Occurred())
10322 PyErr_BadInternalCall();
10323 goto error;
10324 }
10325
Victor Stinnere1335c72011-10-04 20:53:03 +020010326 if (PyUnicode_READY(left))
10327 goto error;
10328 if (PyUnicode_READY(right))
10329 goto error;
10330
Victor Stinner23e56682011-10-03 03:54:37 +020010331 if (PyUnicode_CheckExact(left) && left != unicode_empty
10332 && PyUnicode_CheckExact(right) && right != unicode_empty
10333 && unicode_resizable(left)
10334 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10335 || _PyUnicode_WSTR(left) != NULL))
10336 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010337 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10338 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010339 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010340 not so different than duplicating the string. */
10341 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010342 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010343 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010344 if (p_left != NULL)
10345 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010346 return;
10347 }
10348 }
10349
10350 res = PyUnicode_Concat(left, right);
10351 if (res == NULL)
10352 goto error;
10353 Py_DECREF(left);
10354 *p_left = res;
10355 return;
10356
10357error:
10358 Py_DECREF(*p_left);
10359 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010360}
10361
10362void
10363PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365 PyUnicode_Append(pleft, right);
10366 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010367}
10368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010372Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010373string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010374interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375
10376static PyObject *
10377unicode_count(PyUnicodeObject *self, PyObject *args)
10378{
10379 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010380 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010381 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 int kind1, kind2, kind;
10384 void *buf1, *buf2;
10385 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Jesus Ceaac451502011-04-20 17:09:23 +020010387 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10388 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010389 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 kind1 = PyUnicode_KIND(self);
10392 kind2 = PyUnicode_KIND(substring);
10393 kind = kind1 > kind2 ? kind1 : kind2;
10394 buf1 = PyUnicode_DATA(self);
10395 buf2 = PyUnicode_DATA(substring);
10396 if (kind1 != kind)
10397 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10398 if (!buf1) {
10399 Py_DECREF(substring);
10400 return NULL;
10401 }
10402 if (kind2 != kind)
10403 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10404 if (!buf2) {
10405 Py_DECREF(substring);
10406 if (kind1 != kind) PyMem_Free(buf1);
10407 return NULL;
10408 }
10409 len1 = PyUnicode_GET_LENGTH(self);
10410 len2 = PyUnicode_GET_LENGTH(substring);
10411
10412 ADJUST_INDICES(start, end, len1);
10413 switch(kind) {
10414 case PyUnicode_1BYTE_KIND:
10415 iresult = ucs1lib_count(
10416 ((Py_UCS1*)buf1) + start, end - start,
10417 buf2, len2, PY_SSIZE_T_MAX
10418 );
10419 break;
10420 case PyUnicode_2BYTE_KIND:
10421 iresult = ucs2lib_count(
10422 ((Py_UCS2*)buf1) + start, end - start,
10423 buf2, len2, PY_SSIZE_T_MAX
10424 );
10425 break;
10426 case PyUnicode_4BYTE_KIND:
10427 iresult = ucs4lib_count(
10428 ((Py_UCS4*)buf1) + start, end - start,
10429 buf2, len2, PY_SSIZE_T_MAX
10430 );
10431 break;
10432 default:
10433 assert(0); iresult = 0;
10434 }
10435
10436 result = PyLong_FromSsize_t(iresult);
10437
10438 if (kind1 != kind)
10439 PyMem_Free(buf1);
10440 if (kind2 != kind)
10441 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442
10443 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 return result;
10446}
10447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010448PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010449 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010451Encode S using the codec registered for encoding. Default encoding\n\
10452is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010453handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010454a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10455'xmlcharrefreplace' as well as any other name registered with\n\
10456codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
10458static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010459unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010461 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 char *encoding = NULL;
10463 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010464
Benjamin Peterson308d6372009-09-18 21:42:35 +000010465 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10466 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010468 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010469}
10470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010471PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473\n\
10474Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010475If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476
10477static PyObject*
10478unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10479{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010480 Py_ssize_t i, j, line_pos, src_len, incr;
10481 Py_UCS4 ch;
10482 PyObject *u;
10483 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010485 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010486 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
10488 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490
Antoine Pitrou22425222011-10-04 19:10:51 +020010491 if (PyUnicode_READY(self) == -1)
10492 return NULL;
10493
Thomas Wouters7e474022000-07-16 12:04:32 +000010494 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010495 src_len = PyUnicode_GET_LENGTH(self);
10496 i = j = line_pos = 0;
10497 kind = PyUnicode_KIND(self);
10498 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010499 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010500 for (; i < src_len; i++) {
10501 ch = PyUnicode_READ(kind, src_data, i);
10502 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010503 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010505 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010507 goto overflow;
10508 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010509 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010510 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010514 goto overflow;
10515 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010517 if (ch == '\n' || ch == '\r')
10518 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010520 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010521 if (!found && PyUnicode_CheckExact(self)) {
10522 Py_INCREF((PyObject *) self);
10523 return (PyObject *) self;
10524 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010525
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010527 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 if (!u)
10529 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010530 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
Antoine Pitroue71d5742011-10-04 15:55:09 +020010532 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Antoine Pitroue71d5742011-10-04 15:55:09 +020010534 for (; i < src_len; i++) {
10535 ch = PyUnicode_READ(kind, src_data, i);
10536 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010538 incr = tabsize - (line_pos % tabsize);
10539 line_pos += incr;
10540 while (incr--) {
10541 PyUnicode_WRITE(kind, dest_data, j, ' ');
10542 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010543 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010545 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010547 line_pos++;
10548 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010549 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010550 if (ch == '\n' || ch == '\r')
10551 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010553 }
10554 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010555#ifndef DONT_MAKE_RESULT_READY
10556 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 Py_DECREF(u);
10558 return NULL;
10559 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010560#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010561 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010563
Antoine Pitroue71d5742011-10-04 15:55:09 +020010564 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010565 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567}
10568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010569PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571\n\
10572Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010573such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574arguments start and end are interpreted as in slice notation.\n\
10575\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010576Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
10578static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580{
Jesus Ceaac451502011-04-20 17:09:23 +020010581 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010582 Py_ssize_t start;
10583 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010584 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
Jesus Ceaac451502011-04-20 17:09:23 +020010586 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10587 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 if (PyUnicode_READY(self) == -1)
10591 return NULL;
10592 if (PyUnicode_READY(substring) == -1)
10593 return NULL;
10594
10595 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010596 asciilib_find_slice, ucs1lib_find_slice,
10597 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
10601 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (result == -2)
10604 return NULL;
10605
Christian Heimes217cfd12007-12-02 14:31:20 +000010606 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607}
10608
10609static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010610unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010612 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10613 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616}
10617
Guido van Rossumc2504932007-09-18 19:42:40 +000010618/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010619 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010620static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010621unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622{
Guido van Rossumc2504932007-09-18 19:42:40 +000010623 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010624 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 if (_PyUnicode_HASH(self) != -1)
10627 return _PyUnicode_HASH(self);
10628 if (PyUnicode_READY(self) == -1)
10629 return -1;
10630 len = PyUnicode_GET_LENGTH(self);
10631
10632 /* The hash function as a macro, gets expanded three times below. */
10633#define HASH(P) \
10634 x = (Py_uhash_t)*P << 7; \
10635 while (--len >= 0) \
10636 x = (1000003*x) ^ (Py_uhash_t)*P++;
10637
10638 switch (PyUnicode_KIND(self)) {
10639 case PyUnicode_1BYTE_KIND: {
10640 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10641 HASH(c);
10642 break;
10643 }
10644 case PyUnicode_2BYTE_KIND: {
10645 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10646 HASH(s);
10647 break;
10648 }
10649 default: {
10650 Py_UCS4 *l;
10651 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10652 "Impossible switch case in unicode_hash");
10653 l = PyUnicode_4BYTE_DATA(self);
10654 HASH(l);
10655 break;
10656 }
10657 }
10658 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10659
Guido van Rossumc2504932007-09-18 19:42:40 +000010660 if (x == -1)
10661 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010663 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010667PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010668 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010670Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
10672static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010675 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010676 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010677 Py_ssize_t start;
10678 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
Jesus Ceaac451502011-04-20 17:09:23 +020010680 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10681 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (PyUnicode_READY(self) == -1)
10685 return NULL;
10686 if (PyUnicode_READY(substring) == -1)
10687 return NULL;
10688
10689 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010690 asciilib_find_slice, ucs1lib_find_slice,
10691 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (result == -2)
10698 return NULL;
10699
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 if (result < 0) {
10701 PyErr_SetString(PyExc_ValueError, "substring not found");
10702 return NULL;
10703 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704
Christian Heimes217cfd12007-12-02 14:31:20 +000010705 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706}
10707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010708PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010711Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010712at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
10714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010715unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 Py_ssize_t i, length;
10718 int kind;
10719 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 int cased;
10721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (PyUnicode_READY(self) == -1)
10723 return NULL;
10724 length = PyUnicode_GET_LENGTH(self);
10725 kind = PyUnicode_KIND(self);
10726 data = PyUnicode_DATA(self);
10727
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (length == 1)
10730 return PyBool_FromLong(
10731 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010733 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010736
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 for (i = 0; i < length; i++) {
10739 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010740
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10742 return PyBool_FromLong(0);
10743 else if (!cased && Py_UNICODE_ISLOWER(ch))
10744 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010746 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747}
10748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010749PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010752Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010753at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754
10755static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010756unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 Py_ssize_t i, length;
10759 int kind;
10760 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761 int cased;
10762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (PyUnicode_READY(self) == -1)
10764 return NULL;
10765 length = PyUnicode_GET_LENGTH(self);
10766 kind = PyUnicode_KIND(self);
10767 data = PyUnicode_DATA(self);
10768
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (length == 1)
10771 return PyBool_FromLong(
10772 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010774 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010776 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010777
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 for (i = 0; i < length; i++) {
10780 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010781
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10783 return PyBool_FromLong(0);
10784 else if (!cased && Py_UNICODE_ISUPPER(ch))
10785 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010787 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788}
10789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010790PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010793Return True if S is a titlecased string and there is at least one\n\
10794character in S, i.e. upper- and titlecase characters may only\n\
10795follow uncased characters and lowercase characters only cased ones.\n\
10796Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
10798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010799unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_ssize_t i, length;
10802 int kind;
10803 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 int cased, previous_is_cased;
10805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (PyUnicode_READY(self) == -1)
10807 return NULL;
10808 length = PyUnicode_GET_LENGTH(self);
10809 kind = PyUnicode_KIND(self);
10810 data = PyUnicode_DATA(self);
10811
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 if (length == 1) {
10814 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10815 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10816 (Py_UNICODE_ISUPPER(ch) != 0));
10817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010819 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010822
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 cased = 0;
10824 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 for (i = 0; i < length; i++) {
10826 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010827
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10829 if (previous_is_cased)
10830 return PyBool_FromLong(0);
10831 previous_is_cased = 1;
10832 cased = 1;
10833 }
10834 else if (Py_UNICODE_ISLOWER(ch)) {
10835 if (!previous_is_cased)
10836 return PyBool_FromLong(0);
10837 previous_is_cased = 1;
10838 cased = 1;
10839 }
10840 else
10841 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010843 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844}
10845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010849Return True if all characters in S are whitespace\n\
10850and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851
10852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010853unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 Py_ssize_t i, length;
10856 int kind;
10857 void *data;
10858
10859 if (PyUnicode_READY(self) == -1)
10860 return NULL;
10861 length = PyUnicode_GET_LENGTH(self);
10862 kind = PyUnicode_KIND(self);
10863 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 if (length == 1)
10867 return PyBool_FromLong(
10868 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010870 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 for (i = 0; i < length; i++) {
10875 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010876 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880}
10881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010882PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010884\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010885Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010886and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010887
10888static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010889unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 Py_ssize_t i, length;
10892 int kind;
10893 void *data;
10894
10895 if (PyUnicode_READY(self) == -1)
10896 return NULL;
10897 length = PyUnicode_GET_LENGTH(self);
10898 kind = PyUnicode_KIND(self);
10899 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010900
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010901 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (length == 1)
10903 return PyBool_FromLong(
10904 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010905
10906 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 for (i = 0; i < length; i++) {
10911 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010914 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010915}
10916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010917PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010918 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010920Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010922
10923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010924unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 int kind;
10927 void *data;
10928 Py_ssize_t len, i;
10929
10930 if (PyUnicode_READY(self) == -1)
10931 return NULL;
10932
10933 kind = PyUnicode_KIND(self);
10934 data = PyUnicode_DATA(self);
10935 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010936
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010937 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (len == 1) {
10939 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10940 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10941 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010942
10943 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 for (i = 0; i < len; i++) {
10948 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010949 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010951 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010952 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010953}
10954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010958Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010959False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010962unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 Py_ssize_t i, length;
10965 int kind;
10966 void *data;
10967
10968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970 length = PyUnicode_GET_LENGTH(self);
10971 kind = PyUnicode_KIND(self);
10972 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (length == 1)
10976 return PyBool_FromLong(
10977 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010979 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 for (i = 0; i < length; i++) {
10984 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988}
10989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010990PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010993Return True if all characters in S are digits\n\
10994and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010997unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 Py_ssize_t i, length;
11000 int kind;
11001 void *data;
11002
11003 if (PyUnicode_READY(self) == -1)
11004 return NULL;
11005 length = PyUnicode_GET_LENGTH(self);
11006 kind = PyUnicode_KIND(self);
11007 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 if (length == 1) {
11011 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11012 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 for (i = 0; i < length; i++) {
11020 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024}
11025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011026PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011029Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011030False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
11032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011033unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 Py_ssize_t i, length;
11036 int kind;
11037 void *data;
11038
11039 if (PyUnicode_READY(self) == -1)
11040 return NULL;
11041 length = PyUnicode_GET_LENGTH(self);
11042 kind = PyUnicode_KIND(self);
11043 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 if (length == 1)
11047 return PyBool_FromLong(
11048 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011050 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 for (i = 0; i < length; i++) {
11055 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011058 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Martin v. Löwis47383402007-08-15 07:32:56 +000011061int
11062PyUnicode_IsIdentifier(PyObject *self)
11063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 int kind;
11065 void *data;
11066 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011067 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (PyUnicode_READY(self) == -1) {
11070 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 }
11073
11074 /* Special case for empty strings */
11075 if (PyUnicode_GET_LENGTH(self) == 0)
11076 return 0;
11077 kind = PyUnicode_KIND(self);
11078 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011079
11080 /* PEP 3131 says that the first character must be in
11081 XID_Start and subsequent characters in XID_Continue,
11082 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011083 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011084 letters, digits, underscore). However, given the current
11085 definition of XID_Start and XID_Continue, it is sufficient
11086 to check just for these, except that _ must be allowed
11087 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011089 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011090 return 0;
11091
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011092 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011095 return 1;
11096}
11097
11098PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011100\n\
11101Return True if S is a valid identifier according\n\
11102to the language definition.");
11103
11104static PyObject*
11105unicode_isidentifier(PyObject *self)
11106{
11107 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11108}
11109
Georg Brandl559e5d72008-06-11 18:37:52 +000011110PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011111 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011112\n\
11113Return True if all characters in S are considered\n\
11114printable in repr() or S is empty, False otherwise.");
11115
11116static PyObject*
11117unicode_isprintable(PyObject *self)
11118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 Py_ssize_t i, length;
11120 int kind;
11121 void *data;
11122
11123 if (PyUnicode_READY(self) == -1)
11124 return NULL;
11125 length = PyUnicode_GET_LENGTH(self);
11126 kind = PyUnicode_KIND(self);
11127 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011128
11129 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 if (length == 1)
11131 return PyBool_FromLong(
11132 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 for (i = 0; i < length; i++) {
11135 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011136 Py_RETURN_FALSE;
11137 }
11138 }
11139 Py_RETURN_TRUE;
11140}
11141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011142PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011143 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144\n\
11145Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011146iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
11148static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011149unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011151 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152}
11153
Martin v. Löwis18e16552006-02-15 17:27:45 +000011154static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155unicode_length(PyUnicodeObject *self)
11156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return -1;
11159 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160}
11161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011162PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011165Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011166done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
11168static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011169unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011171 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 Py_UCS4 fillchar = ' ';
11173
11174 if (PyUnicode_READY(self) == -1)
11175 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011176
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011177 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 return NULL;
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181 Py_INCREF(self);
11182 return (PyObject*) self;
11183 }
11184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186}
11187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
11193static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011194unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 return fixup(self, fixlower);
11197}
11198
/* Strip modes shared by the strip helpers below: which end(s) of the
   string are trimmed.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format string for each strip mode: one optional object
   argument, followed by the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: skips the 3-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11207
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011208/* externally visible for str.strip(unicode) */
11209PyObject *
11210_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 void *data;
11213 int kind;
11214 Py_ssize_t i, j, len;
11215 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11218 return NULL;
11219
11220 kind = PyUnicode_KIND(self);
11221 data = PyUnicode_DATA(self);
11222 len = PyUnicode_GET_LENGTH(self);
11223 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11224 PyUnicode_DATA(sepobj),
11225 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011226
Benjamin Peterson14339b62009-01-31 16:36:08 +000011227 i = 0;
11228 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 while (i < len &&
11230 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 i++;
11232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011234
Benjamin Peterson14339b62009-01-31 16:36:08 +000011235 j = len;
11236 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 do {
11238 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 } while (j >= i &&
11240 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011242 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011243
Victor Stinner12bab6d2011-10-01 01:53:49 +020011244 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245}
11246
11247PyObject*
11248PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11249{
11250 unsigned char *data;
11251 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011252 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253
Victor Stinnerde636f32011-10-01 03:55:54 +020011254 if (PyUnicode_READY(self) == -1)
11255 return NULL;
11256
11257 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11258
Victor Stinner12bab6d2011-10-01 01:53:49 +020011259 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011261 if (PyUnicode_CheckExact(self)) {
11262 Py_INCREF(self);
11263 return self;
11264 }
11265 else
11266 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 }
11268
Victor Stinner12bab6d2011-10-01 01:53:49 +020011269 length = end - start;
11270 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011271 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272
Victor Stinnerde636f32011-10-01 03:55:54 +020011273 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011274 PyErr_SetString(PyExc_IndexError, "string index out of range");
11275 return NULL;
11276 }
11277
Victor Stinnerb9275c12011-10-05 14:01:42 +020011278 if (PyUnicode_IS_ASCII(self)) {
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_1BYTE_DATA(self);
11281 return unicode_fromascii(data + start, length);
11282 }
11283 else {
11284 kind = PyUnicode_KIND(self);
11285 data = PyUnicode_1BYTE_DATA(self);
11286 return PyUnicode_FromKindAndData(kind,
11287 data + PyUnicode_KIND_SIZE(kind, start),
11288 length);
11289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011293do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 int kind;
11296 void *data;
11297 Py_ssize_t len, i, j;
11298
11299 if (PyUnicode_READY(self) == -1)
11300 return NULL;
11301
11302 kind = PyUnicode_KIND(self);
11303 data = PyUnicode_DATA(self);
11304 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011305
Benjamin Peterson14339b62009-01-31 16:36:08 +000011306 i = 0;
11307 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011309 i++;
11310 }
11311 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011312
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313 j = len;
11314 if (striptype != LEFTSTRIP) {
11315 do {
11316 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011318 j++;
11319 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011320
Victor Stinner12bab6d2011-10-01 01:53:49 +020011321 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322}
11323
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011324
11325static PyObject *
11326do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011328 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011329
Benjamin Peterson14339b62009-01-31 16:36:08 +000011330 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11331 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011332
Benjamin Peterson14339b62009-01-31 16:36:08 +000011333 if (sep != NULL && sep != Py_None) {
11334 if (PyUnicode_Check(sep))
11335 return _PyUnicode_XStrip(self, striptype, sep);
11336 else {
11337 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 "%s arg must be None or str",
11339 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011340 return NULL;
11341 }
11342 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011343
Benjamin Peterson14339b62009-01-31 16:36:08 +000011344 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011345}
11346
11347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011350\n\
11351Return a copy of the string S with leading and trailing\n\
11352whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011353If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011354
11355static PyObject *
11356unicode_strip(PyUnicodeObject *self, PyObject *args)
11357{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011358 if (PyTuple_GET_SIZE(args) == 0)
11359 return do_strip(self, BOTHSTRIP); /* Common case */
11360 else
11361 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011362}
11363
11364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011367\n\
11368Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011369If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011370
11371static PyObject *
11372unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011374 if (PyTuple_GET_SIZE(args) == 0)
11375 return do_strip(self, LEFTSTRIP); /* Common case */
11376 else
11377 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011378}
11379
11380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011381PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011383\n\
11384Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011385If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011386
11387static PyObject *
11388unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11389{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011390 if (PyTuple_GET_SIZE(args) == 0)
11391 return do_strip(self, RIGHTSTRIP); /* Common case */
11392 else
11393 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011394}
11395
11396
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011398unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399{
11400 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Georg Brandl222de0f2009-04-12 12:01:50 +000011403 if (len < 1) {
11404 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011405 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
Tim Peters7a29bd52001-09-12 03:03:31 +000011408 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 /* no repeat, return original string */
11410 Py_INCREF(str);
11411 return (PyObject*) str;
11412 }
Tim Peters8f422462000-09-09 06:13:41 +000011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 if (PyUnicode_READY(str) == -1)
11415 return NULL;
11416
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011417 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011418 PyErr_SetString(PyExc_OverflowError,
11419 "repeated string is too long");
11420 return NULL;
11421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425 if (!u)
11426 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011427 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 if (PyUnicode_GET_LENGTH(str) == 1) {
11430 const int kind = PyUnicode_KIND(str);
11431 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11432 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011433 if (kind == PyUnicode_1BYTE_KIND)
11434 memset(to, (unsigned char)fill_char, len);
11435 else {
11436 for (n = 0; n < len; ++n)
11437 PyUnicode_WRITE(kind, to, n, fill_char);
11438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 }
11440 else {
11441 /* number of characters copied this far */
11442 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11443 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11444 char *to = (char *) PyUnicode_DATA(u);
11445 Py_MEMCPY(to, PyUnicode_DATA(str),
11446 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 n = (done <= nchars-done) ? done : nchars-done;
11449 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
11453
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011454 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 return (PyObject*) u;
11456}
11457
Alexander Belopolsky40018472011-02-26 01:02:56 +000011458PyObject *
11459PyUnicode_Replace(PyObject *obj,
11460 PyObject *subobj,
11461 PyObject *replobj,
11462 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463{
11464 PyObject *self;
11465 PyObject *str1;
11466 PyObject *str2;
11467 PyObject *result;
11468
11469 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011470 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011473 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 Py_DECREF(self);
11475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 }
11477 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011478 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 Py_DECREF(self);
11480 Py_DECREF(str1);
11481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 Py_DECREF(self);
11485 Py_DECREF(str1);
11486 Py_DECREF(str2);
11487 return result;
11488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011491 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
11493Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011494old replaced by new. If the optional argument count is\n\
11495given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
11497static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 PyObject *str1;
11501 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011502 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 PyObject *result;
11504
Martin v. Löwis18e16552006-02-15 17:27:45 +000011505 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 str1 = PyUnicode_FromObject(str1);
11510 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11511 return NULL;
11512 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011513 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 Py_DECREF(str1);
11515 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
11518 result = replace(self, str1, str2, maxcount);
11519
11520 Py_DECREF(str1);
11521 Py_DECREF(str2);
11522 return result;
11523}
11524
Alexander Belopolsky40018472011-02-26 01:02:56 +000011525static PyObject *
11526unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011528 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 Py_ssize_t isize;
11530 Py_ssize_t osize, squote, dquote, i, o;
11531 Py_UCS4 max, quote;
11532 int ikind, okind;
11533 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011536 return NULL;
11537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 isize = PyUnicode_GET_LENGTH(unicode);
11539 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 /* Compute length of output, quote characters, and
11542 maximum character */
11543 osize = 2; /* quotes */
11544 max = 127;
11545 squote = dquote = 0;
11546 ikind = PyUnicode_KIND(unicode);
11547 for (i = 0; i < isize; i++) {
11548 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11549 switch (ch) {
11550 case '\'': squote++; osize++; break;
11551 case '"': dquote++; osize++; break;
11552 case '\\': case '\t': case '\r': case '\n':
11553 osize += 2; break;
11554 default:
11555 /* Fast-path ASCII */
11556 if (ch < ' ' || ch == 0x7f)
11557 osize += 4; /* \xHH */
11558 else if (ch < 0x7f)
11559 osize++;
11560 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11561 osize++;
11562 max = ch > max ? ch : max;
11563 }
11564 else if (ch < 0x100)
11565 osize += 4; /* \xHH */
11566 else if (ch < 0x10000)
11567 osize += 6; /* \uHHHH */
11568 else
11569 osize += 10; /* \uHHHHHHHH */
11570 }
11571 }
11572
11573 quote = '\'';
11574 if (squote) {
11575 if (dquote)
11576 /* Both squote and dquote present. Use squote,
11577 and escape them */
11578 osize += squote;
11579 else
11580 quote = '"';
11581 }
11582
11583 repr = PyUnicode_New(osize, max);
11584 if (repr == NULL)
11585 return NULL;
11586 okind = PyUnicode_KIND(repr);
11587 odata = PyUnicode_DATA(repr);
11588
11589 PyUnicode_WRITE(okind, odata, 0, quote);
11590 PyUnicode_WRITE(okind, odata, osize-1, quote);
11591
11592 for (i = 0, o = 1; i < isize; i++) {
11593 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011594
11595 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if ((ch == quote) || (ch == '\\')) {
11597 PyUnicode_WRITE(okind, odata, o++, '\\');
11598 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011599 continue;
11600 }
11601
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011603 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 PyUnicode_WRITE(okind, odata, o++, '\\');
11605 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011606 }
11607 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 PyUnicode_WRITE(okind, odata, o++, '\\');
11609 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011610 }
11611 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 PyUnicode_WRITE(okind, odata, o++, '\\');
11613 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011614 }
11615
11616 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011617 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 PyUnicode_WRITE(okind, odata, o++, '\\');
11619 PyUnicode_WRITE(okind, odata, o++, 'x');
11620 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11621 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011622 }
11623
Georg Brandl559e5d72008-06-11 18:37:52 +000011624 /* Copy ASCII characters as-is */
11625 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011627 }
11628
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011630 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011632 (categories Z* and C* except ASCII space)
11633 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011635 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (ch <= 0xff) {
11637 PyUnicode_WRITE(okind, odata, o++, '\\');
11638 PyUnicode_WRITE(okind, odata, o++, 'x');
11639 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11640 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011641 }
11642 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 else if (ch >= 0x10000) {
11644 PyUnicode_WRITE(okind, odata, o++, '\\');
11645 PyUnicode_WRITE(okind, odata, o++, 'U');
11646 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11647 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11648 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11649 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11650 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11651 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11652 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11653 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011654 }
11655 /* Map 16-bit characters to '\uxxxx' */
11656 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 PyUnicode_WRITE(okind, odata, o++, '\\');
11658 PyUnicode_WRITE(okind, odata, o++, 'u');
11659 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11660 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11661 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11662 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011663 }
11664 }
11665 /* Copy characters as-is */
11666 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011668 }
11669 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011672 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011673 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674}
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678\n\
11679Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011680such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681arguments start and end are interpreted as in slice notation.\n\
11682\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
11685static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687{
Jesus Ceaac451502011-04-20 17:09:23 +020011688 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011689 Py_ssize_t start;
11690 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011691 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
Jesus Ceaac451502011-04-20 17:09:23 +020011693 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11694 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699 if (PyUnicode_READY(substring) == -1)
11700 return NULL;
11701
11702 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011703 asciilib_rfind_slice, ucs1lib_rfind_slice,
11704 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011706 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
11708 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (result == -2)
11711 return NULL;
11712
Christian Heimes217cfd12007-12-02 14:31:20 +000011713 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714}
11715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011716PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011719Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
11721static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723{
Jesus Ceaac451502011-04-20 17:09:23 +020011724 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011725 Py_ssize_t start;
11726 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011727 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Jesus Ceaac451502011-04-20 17:09:23 +020011729 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11730 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735 if (PyUnicode_READY(substring) == -1)
11736 return NULL;
11737
11738 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011739 asciilib_rfind_slice, ucs1lib_rfind_slice,
11740 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011742 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (result == -2)
11747 return NULL;
11748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 if (result < 0) {
11750 PyErr_SetString(PyExc_ValueError, "substring not found");
11751 return NULL;
11752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753
Christian Heimes217cfd12007-12-02 14:31:20 +000011754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011760Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011761done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011764unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011766 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 Py_UCS4 fillchar = ' ';
11768
Victor Stinnere9a29352011-10-01 02:14:59 +020011769 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011771
Victor Stinnere9a29352011-10-01 02:14:59 +020011772 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 return NULL;
11774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 Py_INCREF(self);
11777 return (PyObject*) self;
11778 }
11779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
Alexander Belopolsky40018472011-02-26 01:02:56 +000011783PyObject *
11784PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785{
11786 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011787
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 s = PyUnicode_FromObject(s);
11789 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 if (sep != NULL) {
11792 sep = PyUnicode_FromObject(sep);
11793 if (sep == NULL) {
11794 Py_DECREF(s);
11795 return NULL;
11796 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 }
11798
Victor Stinner9310abb2011-10-05 00:59:23 +020011799 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800
11801 Py_DECREF(s);
11802 Py_XDECREF(sep);
11803 return result;
11804}
11805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808\n\
11809Return a list of the words in S, using sep as the\n\
11810delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011811splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011812whitespace string is a separator and empty strings are\n\
11813removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
11815static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011816unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817{
11818 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011819 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
Martin v. Löwis18e16552006-02-15 17:27:45 +000011821 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 return NULL;
11823
11824 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011827 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
Thomas Wouters477c8d52006-05-27 19:21:47 +000011832PyObject *
11833PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11834{
11835 PyObject* str_obj;
11836 PyObject* sep_obj;
11837 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 int kind1, kind2, kind;
11839 void *buf1 = NULL, *buf2 = NULL;
11840 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011841
11842 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011843 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011845 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011847 Py_DECREF(str_obj);
11848 return NULL;
11849 }
11850
Victor Stinner14f8f022011-10-05 20:58:25 +020011851 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011853 kind = Py_MAX(kind1, kind2);
11854 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011856 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (!buf1)
11858 goto onError;
11859 buf2 = PyUnicode_DATA(sep_obj);
11860 if (kind2 != kind)
11861 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11862 if (!buf2)
11863 goto onError;
11864 len1 = PyUnicode_GET_LENGTH(str_obj);
11865 len2 = PyUnicode_GET_LENGTH(sep_obj);
11866
Victor Stinner14f8f022011-10-05 20:58:25 +020011867 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011869 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11870 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11871 else
11872 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 break;
11874 case PyUnicode_2BYTE_KIND:
11875 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11876 break;
11877 case PyUnicode_4BYTE_KIND:
11878 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11879 break;
11880 default:
11881 assert(0);
11882 out = 0;
11883 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011884
11885 Py_DECREF(sep_obj);
11886 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (kind1 != kind)
11888 PyMem_Free(buf1);
11889 if (kind2 != kind)
11890 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011891
11892 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 onError:
11894 Py_DECREF(sep_obj);
11895 Py_DECREF(str_obj);
11896 if (kind1 != kind && buf1)
11897 PyMem_Free(buf1);
11898 if (kind2 != kind && buf2)
11899 PyMem_Free(buf2);
11900 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901}
11902
11903
11904PyObject *
11905PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11906{
11907 PyObject* str_obj;
11908 PyObject* sep_obj;
11909 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 int kind1, kind2, kind;
11911 void *buf1 = NULL, *buf2 = NULL;
11912 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011913
11914 str_obj = PyUnicode_FromObject(str_in);
11915 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917 sep_obj = PyUnicode_FromObject(sep_in);
11918 if (!sep_obj) {
11919 Py_DECREF(str_obj);
11920 return NULL;
11921 }
11922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 kind1 = PyUnicode_KIND(str_in);
11924 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011925 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 buf1 = PyUnicode_DATA(str_in);
11927 if (kind1 != kind)
11928 buf1 = _PyUnicode_AsKind(str_in, kind);
11929 if (!buf1)
11930 goto onError;
11931 buf2 = PyUnicode_DATA(sep_obj);
11932 if (kind2 != kind)
11933 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11934 if (!buf2)
11935 goto onError;
11936 len1 = PyUnicode_GET_LENGTH(str_obj);
11937 len2 = PyUnicode_GET_LENGTH(sep_obj);
11938
11939 switch(PyUnicode_KIND(str_in)) {
11940 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011941 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11942 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11943 else
11944 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 break;
11946 case PyUnicode_2BYTE_KIND:
11947 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11948 break;
11949 case PyUnicode_4BYTE_KIND:
11950 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11951 break;
11952 default:
11953 assert(0);
11954 out = 0;
11955 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011956
11957 Py_DECREF(sep_obj);
11958 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if (kind1 != kind)
11960 PyMem_Free(buf1);
11961 if (kind2 != kind)
11962 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011963
11964 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 onError:
11966 Py_DECREF(sep_obj);
11967 Py_DECREF(str_obj);
11968 if (kind1 != kind && buf1)
11969 PyMem_Free(buf1);
11970 if (kind2 != kind && buf2)
11971 PyMem_Free(buf2);
11972 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011973}
11974
11975PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011977\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011978Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011979the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011980found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011981
11982static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011983unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011984{
Victor Stinner9310abb2011-10-05 00:59:23 +020011985 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011986}
11987
11988PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011989 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011990\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011991Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011992the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011993separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011994
11995static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011996unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011997{
Victor Stinner9310abb2011-10-05 00:59:23 +020011998 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011999}
12000
Alexander Belopolsky40018472011-02-26 01:02:56 +000012001PyObject *
12002PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012003{
12004 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012005
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012006 s = PyUnicode_FromObject(s);
12007 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012008 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 if (sep != NULL) {
12010 sep = PyUnicode_FromObject(sep);
12011 if (sep == NULL) {
12012 Py_DECREF(s);
12013 return NULL;
12014 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012015 }
12016
Victor Stinner9310abb2011-10-05 00:59:23 +020012017 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012018
12019 Py_DECREF(s);
12020 Py_XDECREF(sep);
12021 return result;
12022}
12023
12024PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012026\n\
12027Return a list of the words in S, using sep as the\n\
12028delimiter string, starting at the end of the string and\n\
12029working to the front. If maxsplit is given, at most maxsplit\n\
12030splits are done. If sep is not specified, any whitespace string\n\
12031is a separator.");
12032
12033static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012034unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012035{
12036 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012037 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012038
Martin v. Löwis18e16552006-02-15 17:27:45 +000012039 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012040 return NULL;
12041
12042 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012044 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012045 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012046 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012047 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012048}
12049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012050PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052\n\
12053Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012054Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012055is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
12057static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012058unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012060 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012061 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012063 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12064 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 return NULL;
12066
Guido van Rossum86662912000-04-11 15:38:46 +000012067 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068}
12069
12070static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012071PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072{
Walter Dörwald346737f2007-05-31 10:44:43 +000012073 if (PyUnicode_CheckExact(self)) {
12074 Py_INCREF(self);
12075 return self;
12076 } else
12077 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012078 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012081PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083\n\
12084Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012085and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
12087static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012088unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 return fixup(self, fixswapcase);
12091}
12092
Georg Brandlceee0772007-11-27 23:48:05 +000012093PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012095\n\
12096Return a translation table usable for str.translate().\n\
12097If there is only one argument, it must be a dictionary mapping Unicode\n\
12098ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012099Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012100If there are two arguments, they must be strings of equal length, and\n\
12101in the resulting dictionary, each character in x will be mapped to the\n\
12102character at the same position in y. If there is a third argument, it\n\
12103must be a string, whose characters will be mapped to None in the result.");
12104
12105static PyObject*
12106unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12107{
12108 PyObject *x, *y = NULL, *z = NULL;
12109 PyObject *new = NULL, *key, *value;
12110 Py_ssize_t i = 0;
12111 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112
Georg Brandlceee0772007-11-27 23:48:05 +000012113 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12114 return NULL;
12115 new = PyDict_New();
12116 if (!new)
12117 return NULL;
12118 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 int x_kind, y_kind, z_kind;
12120 void *x_data, *y_data, *z_data;
12121
Georg Brandlceee0772007-11-27 23:48:05 +000012122 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012123 if (!PyUnicode_Check(x)) {
12124 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12125 "be a string if there is a second argument");
12126 goto err;
12127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012129 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12130 "arguments must have equal length");
12131 goto err;
12132 }
12133 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 x_kind = PyUnicode_KIND(x);
12135 y_kind = PyUnicode_KIND(y);
12136 x_data = PyUnicode_DATA(x);
12137 y_data = PyUnicode_DATA(y);
12138 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12139 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12140 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012141 if (!key || !value)
12142 goto err;
12143 res = PyDict_SetItem(new, key, value);
12144 Py_DECREF(key);
12145 Py_DECREF(value);
12146 if (res < 0)
12147 goto err;
12148 }
12149 /* create entries for deleting chars in z */
12150 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 z_kind = PyUnicode_KIND(z);
12152 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012153 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012155 if (!key)
12156 goto err;
12157 res = PyDict_SetItem(new, key, Py_None);
12158 Py_DECREF(key);
12159 if (res < 0)
12160 goto err;
12161 }
12162 }
12163 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 int kind;
12165 void *data;
12166
Georg Brandlceee0772007-11-27 23:48:05 +000012167 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012168 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012169 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12170 "to maketrans it must be a dict");
12171 goto err;
12172 }
12173 /* copy entries into the new dict, converting string keys to int keys */
12174 while (PyDict_Next(x, &i, &key, &value)) {
12175 if (PyUnicode_Check(key)) {
12176 /* convert string keys to integer keys */
12177 PyObject *newkey;
12178 if (PyUnicode_GET_SIZE(key) != 1) {
12179 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12180 "table must be of length 1");
12181 goto err;
12182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 kind = PyUnicode_KIND(key);
12184 data = PyUnicode_DATA(key);
12185 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012186 if (!newkey)
12187 goto err;
12188 res = PyDict_SetItem(new, newkey, value);
12189 Py_DECREF(newkey);
12190 if (res < 0)
12191 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012192 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012193 /* just keep integer keys */
12194 if (PyDict_SetItem(new, key, value) < 0)
12195 goto err;
12196 } else {
12197 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12198 "be strings or integers");
12199 goto err;
12200 }
12201 }
12202 }
12203 return new;
12204 err:
12205 Py_DECREF(new);
12206 return NULL;
12207}
12208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012209PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211\n\
12212Return a copy of the string S, where all characters have been mapped\n\
12213through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012214Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012215Unmapped characters are left untouched. Characters mapped to None\n\
12216are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
12218static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222}
12223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012224PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012227Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228
12229static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012230unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 return fixup(self, fixupper);
12233}
12234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012235PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012238Pad a numeric string S with zeros on the left, to fill a field\n\
12239of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
12241static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012242unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012244 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012245 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012246 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 int kind;
12248 void *data;
12249 Py_UCS4 chr;
12250
12251 if (PyUnicode_READY(self) == -1)
12252 return NULL;
12253
Martin v. Löwis18e16552006-02-15 17:27:45 +000012254 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 return NULL;
12256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012258 if (PyUnicode_CheckExact(self)) {
12259 Py_INCREF(self);
12260 return (PyObject*) self;
12261 }
12262 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012263 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 }
12265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268 u = pad(self, fill, 0, '0');
12269
Walter Dörwald068325e2002-04-15 13:36:47 +000012270 if (u == NULL)
12271 return NULL;
12272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 kind = PyUnicode_KIND(u);
12274 data = PyUnicode_DATA(u);
12275 chr = PyUnicode_READ(kind, data, fill);
12276
12277 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 PyUnicode_WRITE(kind, data, 0, chr);
12280 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 }
12282
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012283 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 return (PyObject*) u;
12285}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
#if 0
/* Compiled out.  Thin wrapper exposing
   PyUnicode_TransformDecimalAndSpaceToASCII() as a method. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012295PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012298Return True if S starts with the specified prefix, False otherwise.\n\
12299With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012300With optional end, stop comparing S at that position.\n\
12301prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
12303static PyObject *
12304unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012307 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012309 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012310 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012311 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312
Jesus Ceaac451502011-04-20 17:09:23 +020012313 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012315 if (PyTuple_Check(subobj)) {
12316 Py_ssize_t i;
12317 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12318 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012320 if (substring == NULL)
12321 return NULL;
12322 result = tailmatch(self, substring, start, end, -1);
12323 Py_DECREF(substring);
12324 if (result) {
12325 Py_RETURN_TRUE;
12326 }
12327 }
12328 /* nothing matched */
12329 Py_RETURN_FALSE;
12330 }
12331 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012332 if (substring == NULL) {
12333 if (PyErr_ExceptionMatches(PyExc_TypeError))
12334 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12335 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012338 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012340 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
12343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012344PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012347Return True if S ends with the specified suffix, False otherwise.\n\
12348With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012349With optional end, stop comparing S at that position.\n\
12350suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351
12352static PyObject *
12353unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012356 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012358 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012359 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012360 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
Jesus Ceaac451502011-04-20 17:09:23 +020012362 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012364 if (PyTuple_Check(subobj)) {
12365 Py_ssize_t i;
12366 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12367 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012369 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012371 result = tailmatch(self, substring, start, end, +1);
12372 Py_DECREF(substring);
12373 if (result) {
12374 Py_RETURN_TRUE;
12375 }
12376 }
12377 Py_RETURN_FALSE;
12378 }
12379 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012380 if (substring == NULL) {
12381 if (PyErr_ExceptionMatches(PyExc_TypeError))
12382 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12383 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012385 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012386 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012388 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389}
12390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012392
12393PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012395\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012396Return a formatted version of S, using substitutions from args and kwargs.\n\
12397The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012398
Eric Smith27bbca62010-11-04 17:06:58 +000012399PyDoc_STRVAR(format_map__doc__,
12400 "S.format_map(mapping) -> str\n\
12401\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012402Return a formatted version of S, using substitutions from mapping.\n\
12403The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012404
Eric Smith4a7d76d2008-05-30 18:10:19 +000012405static PyObject *
12406unicode__format__(PyObject* self, PyObject* args)
12407{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012408 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012409
12410 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12411 return NULL;
12412
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012413 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012415 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012416}
12417
Eric Smith8c663262007-08-25 02:26:07 +000012418PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012420\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012421Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012422
12423static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012424unicode__sizeof__(PyUnicodeObject *v)
12425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 Py_ssize_t size;
12427
12428 /* If it's a compact object, account for base structure +
12429 character data. */
12430 if (PyUnicode_IS_COMPACT_ASCII(v))
12431 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12432 else if (PyUnicode_IS_COMPACT(v))
12433 size = sizeof(PyCompactUnicodeObject) +
12434 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12435 else {
12436 /* If it is a two-block object, account for base object, and
12437 for character block if present. */
12438 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012439 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 size += (PyUnicode_GET_LENGTH(v) + 1) *
12441 PyUnicode_CHARACTER_SIZE(v);
12442 }
12443 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012444 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012445 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012447 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012448 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449
12450 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012451}
12452
12453PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012455
12456static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012457unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012458{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012459 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 if (!copy)
12461 return NULL;
12462 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012463}
12464
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465static PyMethodDef unicode_methods[] = {
12466
12467 /* Order is according to common usage: often used methods should
12468 appear first, since lookup is done sequentially. */
12469
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012470 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012471 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12472 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012474 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12475 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12476 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12477 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12478 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12479 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12480 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012482 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12483 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12484 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012485 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012486 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12487 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12488 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012489 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012490 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012491 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012493 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12494 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12495 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12496 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12497 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12498 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12499 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12500 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12501 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12502 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12503 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12504 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12505 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12506 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012507 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012508 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012509 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012510 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012511 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012512 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012513 {"maketrans", (PyCFunction) unicode_maketrans,
12514 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012515 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012516#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012517 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518#endif
12519
12520#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012521 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012522 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523#endif
12524
Benjamin Peterson14339b62009-01-31 16:36:08 +000012525 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 {NULL, NULL}
12527};
12528
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012529static PyObject *
12530unicode_mod(PyObject *v, PyObject *w)
12531{
Brian Curtindfc80e32011-08-10 20:28:54 -050012532 if (!PyUnicode_Check(v))
12533 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012535}
12536
12537static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012538 0, /*nb_add*/
12539 0, /*nb_subtract*/
12540 0, /*nb_multiply*/
12541 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012542};
12543
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012545 (lenfunc) unicode_length, /* sq_length */
12546 PyUnicode_Concat, /* sq_concat */
12547 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12548 (ssizeargfunc) unicode_getitem, /* sq_item */
12549 0, /* sq_slice */
12550 0, /* sq_ass_item */
12551 0, /* sq_ass_slice */
12552 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553};
12554
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012555static PyObject*
12556unicode_subscript(PyUnicodeObject* self, PyObject* item)
12557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 if (PyUnicode_READY(self) == -1)
12559 return NULL;
12560
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012561 if (PyIndex_Check(item)) {
12562 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012563 if (i == -1 && PyErr_Occurred())
12564 return NULL;
12565 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012567 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012568 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012569 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012570 PyObject *result;
12571 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012572 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012573 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012577 return NULL;
12578 }
12579
12580 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 return PyUnicode_New(0, 0);
12582 } else if (start == 0 && step == 1 &&
12583 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012584 PyUnicode_CheckExact(self)) {
12585 Py_INCREF(self);
12586 return (PyObject *)self;
12587 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012588 return PyUnicode_Substring((PyObject*)self,
12589 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012590 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012591 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012592 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012593 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012594 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012595 src_data = PyUnicode_DATA(self);
12596 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12597 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012598 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012599 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012600 if (max_char >= kind_limit)
12601 break;
12602 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012603 }
12604 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012605 if (result == NULL)
12606 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012607 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012608 dest_data = PyUnicode_DATA(result);
12609
12610 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012611 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12612 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012613 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012614 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012615 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012616 } else {
12617 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12618 return NULL;
12619 }
12620}
12621
12622static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012623 (lenfunc)unicode_length, /* mp_length */
12624 (binaryfunc)unicode_subscript, /* mp_subscript */
12625 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012626};
12627
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629/* Helpers for PyUnicode_Format() */
12630
12631static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012632getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012634 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 (*p_argidx)++;
12637 if (arglen < 0)
12638 return args;
12639 else
12640 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641 }
12642 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644 return NULL;
12645}
12646
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012647/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012649static PyObject *
12650formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012652 char *p;
12653 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012655
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656 x = PyFloat_AsDouble(v);
12657 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012658 return NULL;
12659
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012662
Eric Smith0923d1d2009-04-16 20:16:10 +000012663 p = PyOS_double_to_string(x, type, prec,
12664 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012665 if (p == NULL)
12666 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012668 PyMem_Free(p);
12669 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670}
12671
Tim Peters38fd5b62000-09-21 05:43:11 +000012672static PyObject*
12673formatlong(PyObject *val, int flags, int prec, int type)
12674{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 char *buf;
12676 int len;
12677 PyObject *str; /* temporary string object. */
12678 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012679
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12681 if (!str)
12682 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 Py_DECREF(str);
12685 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012686}
12687
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012690 size_t buflen,
12691 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012693 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012694 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 if (PyUnicode_GET_LENGTH(v) == 1) {
12696 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 buf[1] = '\0';
12698 return 1;
12699 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 goto onError;
12701 }
12702 else {
12703 /* Integer input truncated to a character */
12704 long x;
12705 x = PyLong_AsLong(v);
12706 if (x == -1 && PyErr_Occurred())
12707 goto onError;
12708
12709 if (x < 0 || x > 0x10ffff) {
12710 PyErr_SetString(PyExc_OverflowError,
12711 "%c arg not in range(0x110000)");
12712 return -1;
12713 }
12714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012716 buf[1] = '\0';
12717 return 1;
12718 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012719
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012721 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012723 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724}
12725
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012726/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012727 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012728*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012729#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012730
Alexander Belopolsky40018472011-02-26 01:02:56 +000012731PyObject *
12732PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 void *fmt;
12735 int fmtkind;
12736 PyObject *result;
12737 Py_UCS4 *res, *res0;
12738 Py_UCS4 max;
12739 int kind;
12740 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012744
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 PyErr_BadInternalCall();
12747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12750 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 fmt = PyUnicode_DATA(uformat);
12753 fmtkind = PyUnicode_KIND(uformat);
12754 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12755 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756
12757 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12759 if (res0 == NULL) {
12760 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763
12764 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 arglen = PyTuple_Size(args);
12766 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767 }
12768 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012769 arglen = -1;
12770 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012772 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012773 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775
12776 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 if (--rescnt < 0) {
12779 rescnt = fmtcnt + 100;
12780 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12782 if (res0 == NULL){
12783 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 }
12786 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 }
12791 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 /* Got a format specifier */
12793 int flags = 0;
12794 Py_ssize_t width = -1;
12795 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 Py_UCS4 c = '\0';
12797 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 int isnumok;
12799 PyObject *v = NULL;
12800 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 void *pbuf;
12802 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 Py_ssize_t len, len1;
12805 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 fmtpos++;
12808 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12809 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 Py_ssize_t keylen;
12811 PyObject *key;
12812 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012813
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 if (dict == NULL) {
12815 PyErr_SetString(PyExc_TypeError,
12816 "format requires a mapping");
12817 goto onError;
12818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 /* Skip over balanced parentheses */
12823 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 if (fmtcnt < 0 || pcount > 0) {
12832 PyErr_SetString(PyExc_ValueError,
12833 "incomplete format key");
12834 goto onError;
12835 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012836 key = PyUnicode_Substring((PyObject*)uformat,
12837 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 if (key == NULL)
12839 goto onError;
12840 if (args_owned) {
12841 Py_DECREF(args);
12842 args_owned = 0;
12843 }
12844 args = PyObject_GetItem(dict, key);
12845 Py_DECREF(key);
12846 if (args == NULL) {
12847 goto onError;
12848 }
12849 args_owned = 1;
12850 arglen = -1;
12851 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012852 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 case '-': flags |= F_LJUST; continue;
12856 case '+': flags |= F_SIGN; continue;
12857 case ' ': flags |= F_BLANK; continue;
12858 case '#': flags |= F_ALT; continue;
12859 case '0': flags |= F_ZERO; continue;
12860 }
12861 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 if (c == '*') {
12864 v = getnextarg(args, arglen, &argidx);
12865 if (v == NULL)
12866 goto onError;
12867 if (!PyLong_Check(v)) {
12868 PyErr_SetString(PyExc_TypeError,
12869 "* wants int");
12870 goto onError;
12871 }
12872 width = PyLong_AsLong(v);
12873 if (width == -1 && PyErr_Occurred())
12874 goto onError;
12875 if (width < 0) {
12876 flags |= F_LJUST;
12877 width = -width;
12878 }
12879 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 }
12882 else if (c >= '0' && c <= '9') {
12883 width = c - '0';
12884 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 if (c < '0' || c > '9')
12887 break;
12888 if ((width*10) / 10 != width) {
12889 PyErr_SetString(PyExc_ValueError,
12890 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012891 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 }
12893 width = width*10 + (c - '0');
12894 }
12895 }
12896 if (c == '.') {
12897 prec = 0;
12898 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 if (c == '*') {
12901 v = getnextarg(args, arglen, &argidx);
12902 if (v == NULL)
12903 goto onError;
12904 if (!PyLong_Check(v)) {
12905 PyErr_SetString(PyExc_TypeError,
12906 "* wants int");
12907 goto onError;
12908 }
12909 prec = PyLong_AsLong(v);
12910 if (prec == -1 && PyErr_Occurred())
12911 goto onError;
12912 if (prec < 0)
12913 prec = 0;
12914 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012916 }
12917 else if (c >= '0' && c <= '9') {
12918 prec = c - '0';
12919 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012921 if (c < '0' || c > '9')
12922 break;
12923 if ((prec*10) / 10 != prec) {
12924 PyErr_SetString(PyExc_ValueError,
12925 "prec too big");
12926 goto onError;
12927 }
12928 prec = prec*10 + (c - '0');
12929 }
12930 }
12931 } /* prec */
12932 if (fmtcnt >= 0) {
12933 if (c == 'h' || c == 'l' || c == 'L') {
12934 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 }
12937 }
12938 if (fmtcnt < 0) {
12939 PyErr_SetString(PyExc_ValueError,
12940 "incomplete format");
12941 goto onError;
12942 }
12943 if (c != '%') {
12944 v = getnextarg(args, arglen, &argidx);
12945 if (v == NULL)
12946 goto onError;
12947 }
12948 sign = 0;
12949 fill = ' ';
12950 switch (c) {
12951
12952 case '%':
12953 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 len = 1;
12958 break;
12959
12960 case 's':
12961 case 'r':
12962 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012963 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012964 temp = v;
12965 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012966 }
12967 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 if (c == 's')
12969 temp = PyObject_Str(v);
12970 else if (c == 'r')
12971 temp = PyObject_Repr(v);
12972 else
12973 temp = PyObject_ASCII(v);
12974 if (temp == NULL)
12975 goto onError;
12976 if (PyUnicode_Check(temp))
12977 /* nothing to do */;
12978 else {
12979 Py_DECREF(temp);
12980 PyErr_SetString(PyExc_TypeError,
12981 "%s argument has non-string str()");
12982 goto onError;
12983 }
12984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 if (PyUnicode_READY(temp) == -1) {
12986 Py_CLEAR(temp);
12987 goto onError;
12988 }
12989 pbuf = PyUnicode_DATA(temp);
12990 kind = PyUnicode_KIND(temp);
12991 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 if (prec >= 0 && len > prec)
12993 len = prec;
12994 break;
12995
12996 case 'i':
12997 case 'd':
12998 case 'u':
12999 case 'o':
13000 case 'x':
13001 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 isnumok = 0;
13003 if (PyNumber_Check(v)) {
13004 PyObject *iobj=NULL;
13005
13006 if (PyLong_Check(v)) {
13007 iobj = v;
13008 Py_INCREF(iobj);
13009 }
13010 else {
13011 iobj = PyNumber_Long(v);
13012 }
13013 if (iobj!=NULL) {
13014 if (PyLong_Check(iobj)) {
13015 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013016 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013017 Py_DECREF(iobj);
13018 if (!temp)
13019 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 if (PyUnicode_READY(temp) == -1) {
13021 Py_CLEAR(temp);
13022 goto onError;
13023 }
13024 pbuf = PyUnicode_DATA(temp);
13025 kind = PyUnicode_KIND(temp);
13026 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 sign = 1;
13028 }
13029 else {
13030 Py_DECREF(iobj);
13031 }
13032 }
13033 }
13034 if (!isnumok) {
13035 PyErr_Format(PyExc_TypeError,
13036 "%%%c format: a number is required, "
13037 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13038 goto onError;
13039 }
13040 if (flags & F_ZERO)
13041 fill = '0';
13042 break;
13043
13044 case 'e':
13045 case 'E':
13046 case 'f':
13047 case 'F':
13048 case 'g':
13049 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013050 temp = formatfloat(v, flags, prec, c);
13051 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 if (PyUnicode_READY(temp) == -1) {
13054 Py_CLEAR(temp);
13055 goto onError;
13056 }
13057 pbuf = PyUnicode_DATA(temp);
13058 kind = PyUnicode_KIND(temp);
13059 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 sign = 1;
13061 if (flags & F_ZERO)
13062 fill = '0';
13063 break;
13064
13065 case 'c':
13066 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020013068 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 if (len < 0)
13070 goto onError;
13071 break;
13072
13073 default:
13074 PyErr_Format(PyExc_ValueError,
13075 "unsupported format character '%c' (0x%x) "
13076 "at index %zd",
13077 (31<=c && c<=126) ? (char)c : '?',
13078 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013080 goto onError;
13081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 /* pbuf is initialized here. */
13083 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013084 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13086 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13087 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 len--;
13089 }
13090 else if (flags & F_SIGN)
13091 sign = '+';
13092 else if (flags & F_BLANK)
13093 sign = ' ';
13094 else
13095 sign = 0;
13096 }
13097 if (width < len)
13098 width = len;
13099 if (rescnt - (sign != 0) < width) {
13100 reslen -= rescnt;
13101 rescnt = width + fmtcnt + 100;
13102 reslen += rescnt;
13103 if (reslen < 0) {
13104 Py_XDECREF(temp);
13105 PyErr_NoMemory();
13106 goto onError;
13107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13109 if (res0 == 0) {
13110 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 Py_XDECREF(temp);
13112 goto onError;
13113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 }
13116 if (sign) {
13117 if (fill != ' ')
13118 *res++ = sign;
13119 rescnt--;
13120 if (width > len)
13121 width--;
13122 }
13123 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13125 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13128 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 }
13130 rescnt -= 2;
13131 width -= 2;
13132 if (width < 0)
13133 width = 0;
13134 len -= 2;
13135 }
13136 if (width > len && !(flags & F_LJUST)) {
13137 do {
13138 --rescnt;
13139 *res++ = fill;
13140 } while (--width > len);
13141 }
13142 if (fill == ' ') {
13143 if (sign)
13144 *res++ = sign;
13145 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013146 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13147 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13148 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13149 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 }
13151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 /* Copy all characters, preserving len */
13153 len1 = len;
13154 while (len1--) {
13155 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13156 rescnt--;
13157 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 while (--width >= len) {
13159 --rescnt;
13160 *res++ = ' ';
13161 }
13162 if (dict && (argidx < arglen) && c != '%') {
13163 PyErr_SetString(PyExc_TypeError,
13164 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013165 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 goto onError;
13167 }
13168 Py_XDECREF(temp);
13169 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170 } /* until end */
13171 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 PyErr_SetString(PyExc_TypeError,
13173 "not all arguments converted during string formatting");
13174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175 }
13176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177
13178 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13179 if (*res > max)
13180 max = *res;
13181 result = PyUnicode_New(reslen - rescnt, max);
13182 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 kind = PyUnicode_KIND(result);
13185 for (res = res0; res < res0+reslen-rescnt; res++)
13186 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13187 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190 }
13191 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013192 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193 return (PyObject *)result;
13194
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197 Py_DECREF(uformat);
13198 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013199 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 }
13201 return NULL;
13202}
13203
Jeremy Hylton938ace62002-07-17 16:30:39 +000013204static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013205unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13206
Tim Peters6d6c1a32001-08-02 04:15:00 +000013207static PyObject *
13208unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13209{
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013211 static char *kwlist[] = {"object", "encoding", "errors", 0};
13212 char *encoding = NULL;
13213 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013214
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 if (type != &PyUnicode_Type)
13216 return unicode_subtype_new(type, args, kwds);
13217 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 return NULL;
13220 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013222 if (encoding == NULL && errors == NULL)
13223 return PyObject_Str(x);
13224 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013226}
13227
Guido van Rossume023fe02001-08-30 03:12:59 +000013228static PyObject *
13229unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13230{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013231 PyUnicodeObject *unicode, *self;
13232 Py_ssize_t length, char_size;
13233 int share_wstr, share_utf8;
13234 unsigned int kind;
13235 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013236
Benjamin Peterson14339b62009-01-31 16:36:08 +000013237 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013238
13239 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13240 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013241 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013242 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013243 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013244 return NULL;
13245
13246 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13247 if (self == NULL) {
13248 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013249 return NULL;
13250 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013251 kind = PyUnicode_KIND(unicode);
13252 length = PyUnicode_GET_LENGTH(unicode);
13253
13254 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013255#ifdef Py_DEBUG
13256 _PyUnicode_HASH(self) = -1;
13257#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013258 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013259#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013260 _PyUnicode_STATE(self).interned = 0;
13261 _PyUnicode_STATE(self).kind = kind;
13262 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013263 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013264 _PyUnicode_STATE(self).ready = 1;
13265 _PyUnicode_WSTR(self) = NULL;
13266 _PyUnicode_UTF8_LENGTH(self) = 0;
13267 _PyUnicode_UTF8(self) = NULL;
13268 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013269 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013270
13271 share_utf8 = 0;
13272 share_wstr = 0;
13273 if (kind == PyUnicode_1BYTE_KIND) {
13274 char_size = 1;
13275 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13276 share_utf8 = 1;
13277 }
13278 else if (kind == PyUnicode_2BYTE_KIND) {
13279 char_size = 2;
13280 if (sizeof(wchar_t) == 2)
13281 share_wstr = 1;
13282 }
13283 else {
13284 assert(kind == PyUnicode_4BYTE_KIND);
13285 char_size = 4;
13286 if (sizeof(wchar_t) == 4)
13287 share_wstr = 1;
13288 }
13289
13290 /* Ensure we won't overflow the length. */
13291 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13292 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013295 data = PyObject_MALLOC((length + 1) * char_size);
13296 if (data == NULL) {
13297 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 goto onError;
13299 }
13300
Victor Stinnerc3c74152011-10-02 20:39:55 +020013301 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013302 if (share_utf8) {
13303 _PyUnicode_UTF8_LENGTH(self) = length;
13304 _PyUnicode_UTF8(self) = data;
13305 }
13306 if (share_wstr) {
13307 _PyUnicode_WSTR_LENGTH(self) = length;
13308 _PyUnicode_WSTR(self) = (wchar_t *)data;
13309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013311 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13312 PyUnicode_KIND_SIZE(kind, length + 1));
13313 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013314 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013315#ifdef Py_DEBUG
13316 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13317#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013318 return (PyObject *)self;
13319
13320onError:
13321 Py_DECREF(unicode);
13322 Py_DECREF(self);
13323 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013324}
13325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013326PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013327 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013328\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013329Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013330encoding defaults to the current default string encoding.\n\
13331errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013332
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013333static PyObject *unicode_iter(PyObject *seq);
13334
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013336 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 "str", /* tp_name */
13338 sizeof(PyUnicodeObject), /* tp_size */
13339 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013341 (destructor)unicode_dealloc, /* tp_dealloc */
13342 0, /* tp_print */
13343 0, /* tp_getattr */
13344 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013345 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346 unicode_repr, /* tp_repr */
13347 &unicode_as_number, /* tp_as_number */
13348 &unicode_as_sequence, /* tp_as_sequence */
13349 &unicode_as_mapping, /* tp_as_mapping */
13350 (hashfunc) unicode_hash, /* tp_hash*/
13351 0, /* tp_call*/
13352 (reprfunc) unicode_str, /* tp_str */
13353 PyObject_GenericGetAttr, /* tp_getattro */
13354 0, /* tp_setattro */
13355 0, /* tp_as_buffer */
13356 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 unicode_doc, /* tp_doc */
13359 0, /* tp_traverse */
13360 0, /* tp_clear */
13361 PyUnicode_RichCompare, /* tp_richcompare */
13362 0, /* tp_weaklistoffset */
13363 unicode_iter, /* tp_iter */
13364 0, /* tp_iternext */
13365 unicode_methods, /* tp_methods */
13366 0, /* tp_members */
13367 0, /* tp_getset */
13368 &PyBaseObject_Type, /* tp_base */
13369 0, /* tp_dict */
13370 0, /* tp_descr_get */
13371 0, /* tp_descr_set */
13372 0, /* tp_dictoffset */
13373 0, /* tp_init */
13374 0, /* tp_alloc */
13375 unicode_new, /* tp_new */
13376 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377};
13378
13379/* Initialize the Unicode implementation */
13380
Thomas Wouters78890102000-07-22 19:25:51 +000013381void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013383 int i;
13384
Thomas Wouters477c8d52006-05-27 19:21:47 +000013385 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013387 0x000A, /* LINE FEED */
13388 0x000D, /* CARRIAGE RETURN */
13389 0x001C, /* FILE SEPARATOR */
13390 0x001D, /* GROUP SEPARATOR */
13391 0x001E, /* RECORD SEPARATOR */
13392 0x0085, /* NEXT LINE */
13393 0x2028, /* LINE SEPARATOR */
13394 0x2029, /* PARAGRAPH SEPARATOR */
13395 };
13396
Fred Drakee4315f52000-05-09 19:53:39 +000013397 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013398 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013399 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013403 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013405 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013407
13408 /* initialize the linebreak bloom filter */
13409 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013411 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412
13413 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414}
13415
13416/* Finalize the Unicode implementation */
13417
/* Public free-list hook.  The body does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13423
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424void
Thomas Wouters78890102000-07-22 19:25:51 +000013425_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013427 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013429 Py_XDECREF(unicode_empty);
13430 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013431
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013432 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 if (unicode_latin1[i]) {
13434 Py_DECREF(unicode_latin1[i]);
13435 unicode_latin1[i] = NULL;
13436 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013437 }
Christian Heimesa156e092008-02-16 07:38:31 +000013438 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013440
Walter Dörwald16807132007-05-25 13:52:07 +000013441void
13442PyUnicode_InternInPlace(PyObject **p)
13443{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013444 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13445 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013446#ifdef Py_DEBUG
13447 assert(s != NULL);
13448 assert(_PyUnicode_CHECK(s));
13449#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013450 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013451 return;
13452#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 /* If it's a subclass, we don't really know what putting
13454 it in the interned dict might do. */
13455 if (!PyUnicode_CheckExact(s))
13456 return;
13457 if (PyUnicode_CHECK_INTERNED(s))
13458 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013459 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013460 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461 return;
13462 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013463 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013464 if (interned == NULL) {
13465 interned = PyDict_New();
13466 if (interned == NULL) {
13467 PyErr_Clear(); /* Don't leave an exception */
13468 return;
13469 }
13470 }
13471 /* It might be that the GetItem call fails even
13472 though the key is present in the dictionary,
13473 namely when this happens during a stack overflow. */
13474 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013477
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 if (t) {
13479 Py_INCREF(t);
13480 Py_DECREF(*p);
13481 *p = t;
13482 return;
13483 }
Walter Dörwald16807132007-05-25 13:52:07 +000013484
Benjamin Peterson14339b62009-01-31 16:36:08 +000013485 PyThreadState_GET()->recursion_critical = 1;
13486 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13487 PyErr_Clear();
13488 PyThreadState_GET()->recursion_critical = 0;
13489 return;
13490 }
13491 PyThreadState_GET()->recursion_critical = 0;
13492 /* The two references in interned are not counted by refcnt.
13493 The deallocator will take care of this */
13494 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013496}
13497
13498void
13499PyUnicode_InternImmortal(PyObject **p)
13500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13502
Benjamin Peterson14339b62009-01-31 16:36:08 +000013503 PyUnicode_InternInPlace(p);
13504 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013506 Py_INCREF(*p);
13507 }
Walter Dörwald16807132007-05-25 13:52:07 +000013508}
13509
13510PyObject *
13511PyUnicode_InternFromString(const char *cp)
13512{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013513 PyObject *s = PyUnicode_FromString(cp);
13514 if (s == NULL)
13515 return NULL;
13516 PyUnicode_InternInPlace(&s);
13517 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013518}
13519
Alexander Belopolsky40018472011-02-26 01:02:56 +000013520void
13521_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013522{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013523 PyObject *keys;
13524 PyUnicodeObject *s;
13525 Py_ssize_t i, n;
13526 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013527
Benjamin Peterson14339b62009-01-31 16:36:08 +000013528 if (interned == NULL || !PyDict_Check(interned))
13529 return;
13530 keys = PyDict_Keys(interned);
13531 if (keys == NULL || !PyList_Check(keys)) {
13532 PyErr_Clear();
13533 return;
13534 }
Walter Dörwald16807132007-05-25 13:52:07 +000013535
Benjamin Peterson14339b62009-01-31 16:36:08 +000013536 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13537 detector, interned unicode strings are not forcibly deallocated;
13538 rather, we give them their stolen references back, and then clear
13539 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013540
Benjamin Peterson14339b62009-01-31 16:36:08 +000013541 n = PyList_GET_SIZE(keys);
13542 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013544 for (i = 0; i < n; i++) {
13545 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013546 if (PyUnicode_READY(s) == -1) {
13547 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013550 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013551 case SSTATE_NOT_INTERNED:
13552 /* XXX Shouldn't happen */
13553 break;
13554 case SSTATE_INTERNED_IMMORTAL:
13555 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013556 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 break;
13558 case SSTATE_INTERNED_MORTAL:
13559 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013560 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013561 break;
13562 default:
13563 Py_FatalError("Inconsistent interned string state.");
13564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013565 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013566 }
13567 fprintf(stderr, "total size of all interned strings: "
13568 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13569 "mortal/immortal\n", mortal_size, immortal_size);
13570 Py_DECREF(keys);
13571 PyDict_Clear(interned);
13572 Py_DECREF(interned);
13573 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013574}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013575
13576
13577/********************* Unicode Iterator **************************/
13578
13579typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013580 PyObject_HEAD
13581 Py_ssize_t it_index;
13582 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013583} unicodeiterobject;
13584
13585static void
13586unicodeiter_dealloc(unicodeiterobject *it)
13587{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013588 _PyObject_GC_UNTRACK(it);
13589 Py_XDECREF(it->it_seq);
13590 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013591}
13592
13593static int
13594unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13595{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013596 Py_VISIT(it->it_seq);
13597 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013598}
13599
13600static PyObject *
13601unicodeiter_next(unicodeiterobject *it)
13602{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 PyUnicodeObject *seq;
13604 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013605
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 assert(it != NULL);
13607 seq = it->it_seq;
13608 if (seq == NULL)
13609 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013610 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013612 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13613 int kind = PyUnicode_KIND(seq);
13614 void *data = PyUnicode_DATA(seq);
13615 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13616 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013617 if (item != NULL)
13618 ++it->it_index;
13619 return item;
13620 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013621
Benjamin Peterson14339b62009-01-31 16:36:08 +000013622 Py_DECREF(seq);
13623 it->it_seq = NULL;
13624 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013625}
13626
13627static PyObject *
13628unicodeiter_len(unicodeiterobject *it)
13629{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013630 Py_ssize_t len = 0;
13631 if (it->it_seq)
13632 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13633 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013634}
13635
13636PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13637
13638static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013639 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013641 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013642};
13643
13644PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013645 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13646 "str_iterator", /* tp_name */
13647 sizeof(unicodeiterobject), /* tp_basicsize */
13648 0, /* tp_itemsize */
13649 /* methods */
13650 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13651 0, /* tp_print */
13652 0, /* tp_getattr */
13653 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013654 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013655 0, /* tp_repr */
13656 0, /* tp_as_number */
13657 0, /* tp_as_sequence */
13658 0, /* tp_as_mapping */
13659 0, /* tp_hash */
13660 0, /* tp_call */
13661 0, /* tp_str */
13662 PyObject_GenericGetAttr, /* tp_getattro */
13663 0, /* tp_setattro */
13664 0, /* tp_as_buffer */
13665 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13666 0, /* tp_doc */
13667 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13668 0, /* tp_clear */
13669 0, /* tp_richcompare */
13670 0, /* tp_weaklistoffset */
13671 PyObject_SelfIter, /* tp_iter */
13672 (iternextfunc)unicodeiter_next, /* tp_iternext */
13673 unicodeiter_methods, /* tp_methods */
13674 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013675};
13676
13677static PyObject *
13678unicode_iter(PyObject *seq)
13679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013681
Benjamin Peterson14339b62009-01-31 16:36:08 +000013682 if (!PyUnicode_Check(seq)) {
13683 PyErr_BadInternalCall();
13684 return NULL;
13685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 if (PyUnicode_READY(seq) == -1)
13687 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013688 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13689 if (it == NULL)
13690 return NULL;
13691 it->it_index = 0;
13692 Py_INCREF(seq);
13693 it->it_seq = (PyUnicodeObject *)seq;
13694 _PyObject_GC_TRACK(it);
13695 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013696}
13697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013698#define UNIOP(x) Py_UNICODE_##x
13699#define UNIOP_t Py_UNICODE
13700#include "uniops.h"
13701#undef UNIOP
13702#undef UNIOP_t
13703#define UNIOP(x) Py_UCS4_##x
13704#define UNIOP_t Py_UCS4
13705#include "uniops.h"
13706#undef UNIOP
13707#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013708
Victor Stinner71133ff2010-09-01 23:43:53 +000013709Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013710PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013711{
13712 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13713 Py_UNICODE *copy;
13714 Py_ssize_t size;
13715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013716 if (!PyUnicode_Check(unicode)) {
13717 PyErr_BadArgument();
13718 return NULL;
13719 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013720 /* Ensure we won't overflow the size. */
13721 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13722 PyErr_NoMemory();
13723 return NULL;
13724 }
13725 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13726 size *= sizeof(Py_UNICODE);
13727 copy = PyMem_Malloc(size);
13728 if (copy == NULL) {
13729 PyErr_NoMemory();
13730 return NULL;
13731 }
13732 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13733 return copy;
13734}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013735
Georg Brandl66c221e2010-10-14 07:04:07 +000013736/* A _string module, to export formatter_parser and formatter_field_name_split
13737 to the string.Formatter class implemented in Python. */
13738
13739static PyMethodDef _string_methods[] = {
13740 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13741 METH_O, PyDoc_STR("split the argument as a field name")},
13742 {"formatter_parser", (PyCFunction) formatter_parser,
13743 METH_O, PyDoc_STR("parse the argument as a format string")},
13744 {NULL, NULL}
13745};
13746
13747static struct PyModuleDef _string_module = {
13748 PyModuleDef_HEAD_INIT,
13749 "_string",
13750 PyDoc_STR("string helper module"),
13751 0,
13752 _string_methods,
13753 NULL,
13754 NULL,
13755 NULL,
13756 NULL
13757};
13758
13759PyMODINIT_FUNC
13760PyInit__string(void)
13761{
13762 return PyModule_Create(&_string_module);
13763}
13764
13765
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013766#ifdef __cplusplus
13767}
13768#endif