/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once this many are parked on the list. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit.  The size compared against the limit is the object's length
   in characters.  This reduces malloc() overhead for small Unicode
   objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each parked object holds the next entry.
   numfree counts the parked objects and is kept below
   PyUnicode_MAXFREELIST by unicode_dealloc(). */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are filled lazily, indexed by code point. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per 7-bit code point (0x00 - 0x7F):
   1 if the character is whitespace, 0 otherwise.  Callers must only
   index it with values below 128. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: HORIZONTAL TABULATION
       0x000A: LINE FEED
       0x000B: VERTICAL TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks.

   Lookup table with one entry per 7-bit code point (0x00 - 0x7F):
   1 if the character is a line break, 0 otherwise.  The table is
   read-only, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177 return 0x10FFFF;
178#else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
192
/* Integer type holding one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128; it is
   filled in by initialization code outside this section. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the 5 least significant bits of ch is
   set in mask.  The shifted constant is 1UL rather than 1 because
   shifting a signed int into bit 31 is undefined behavior; mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break: table lookup for ch < 128, otherwise
   a bloom pre-check followed by the exact Py_UNICODE_ISLINEBREAK()
   test.  Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Non-zero if chr is a member of set (setlen characters): a cheap bloom
   pre-check against mask, then the exact unicode_member() scan.  The
   whole expansion is parenthesized so the macro behaves as a single
   operand next to !, || and similar operators.  chr is evaluated more
   than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000317 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000318 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 }
320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
337 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
354 onError:
355 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358}
359
360static
Guido van Rossum9475a232001-10-05 20:51:39 +0000361void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000363 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000364 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000365 /* Keep-Alive optimization */
366 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000367 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 unicode->str = NULL;
369 unicode->length = 0;
370 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000371 if (unicode->defenc) {
372 Py_DECREF(unicode->defenc);
373 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000374 }
375 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000376 *(PyUnicodeObject **)unicode = free_list;
377 free_list = unicode;
378 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 }
380 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000381 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000382 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000383 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 }
385}
386
Martin v. Löwis18e16552006-02-15 17:27:45 +0000387int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000388{
389 register PyUnicodeObject *v;
390
391 /* Argument checks */
392 if (unicode == NULL) {
393 PyErr_BadInternalCall();
394 return -1;
395 }
396 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000397 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 PyErr_BadInternalCall();
399 return -1;
400 }
401
402 /* Resizing unicode_empty and single character objects is not
403 possible since these are being shared. We simply return a fresh
404 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000405 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 (v == unicode_empty || v->length == 1)) {
407 PyUnicodeObject *w = _PyUnicode_New(length);
408 if (w == NULL)
409 return -1;
410 Py_UNICODE_COPY(w->str, v->str,
411 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000412 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 *unicode = (PyObject *)w;
414 return 0;
415 }
416
417 /* Note that we don't have to modify *unicode for unshared Unicode
418 objects, since we can modify them in-place. */
419 return unicode_resize(v, length);
420}
421
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize() after casting unicodevar to
   PyObject **.  NOTE(review): presumably meant for passing the address
   of a PyUnicodeObject * variable -- confirm against the call sites. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
425
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000427 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428{
429 PyUnicodeObject *unicode;
430
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000431 /* If the Unicode data is known at construction time, we can apply
432 some optimizations which share commonly used objects. */
433 if (u != NULL) {
434
435 /* Optimization for empty strings */
436 if (size == 0 && unicode_empty != NULL) {
437 Py_INCREF(unicode_empty);
438 return (PyObject *)unicode_empty;
439 }
440
441 /* Single character Unicode objects in the Latin-1 range are
442 shared when using this constructor */
443 if (size == 1 && *u < 256) {
444 unicode = unicode_latin1[*u];
445 if (!unicode) {
446 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 if (!unicode)
448 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000449 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000450 unicode_latin1[*u] = unicode;
451 }
452 Py_INCREF(unicode);
453 return (PyObject *)unicode;
454 }
455 }
Tim Petersced69f82003-09-16 20:30:58 +0000456
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 unicode = _PyUnicode_New(size);
458 if (!unicode)
459 return NULL;
460
461 /* Copy the Unicode data into the new object */
462 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464
465 return (PyObject *)unicode;
466}
467
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000468PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
469{
470 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000471
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000472 if (size < 0) {
473 PyErr_SetString(PyExc_SystemError,
474 "Negative size passed to PyUnicode_FromStringAndSize");
475 return NULL;
476 }
477
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000478 /* If the Unicode data is known at construction time, we can apply
479 some optimizations which share commonly used objects.
480 Also, this means the input must be UTF-8, so fall back to the
481 UTF-8 decoder at the end. */
482 if (u != NULL) {
483
484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
488 }
489
490 /* Single characters are shared when using this constructor.
491 Restrict to ASCII, since the input must be UTF-8. */
492 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000493 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000499 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
504
505 return PyUnicode_DecodeUTF8(u, size, NULL);
506 }
507
508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 return (PyObject *)unicode;
513}
514
515PyObject *PyUnicode_FromString(const char *u)
516{
517 size_t size = strlen(u);
518 if (size > PY_SSIZE_T_MAX) {
519 PyErr_SetString(PyExc_OverflowError, "input too long");
520 return NULL;
521 }
522
523 return PyUnicode_FromStringAndSize(u, size);
524}
525
Guido van Rossumd57fd912000-03-10 22:53:23 +0000526#ifdef HAVE_WCHAR_H
527
528PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000529 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530{
531 PyUnicodeObject *unicode;
532
533 if (w == NULL) {
534 PyErr_BadInternalCall();
535 return NULL;
536 }
537
538 unicode = _PyUnicode_New(size);
539 if (!unicode)
540 return NULL;
541
542 /* Copy the wchar_t data into the new object */
543#ifdef HAVE_USABLE_WCHAR_T
544 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000545#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 {
547 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000548 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000550 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551 *u++ = *w++;
552 }
553#endif
554
555 return (PyObject *)unicode;
556}
557
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000558static void
559makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
560{
561 *fmt++ = '%';
562 if (width) {
563 if (zeropad)
564 *fmt++ = '0';
565 fmt += sprintf(fmt, "%d", width);
566 }
567 if (precision)
568 fmt += sprintf(fmt, ".%d", precision);
569 if (longflag)
570 *fmt++ = 'l';
571 else if (size_tflag) {
572 char *f = PY_FORMAT_SIZE_T;
573 while (*f)
574 *fmt++ = *f++;
575 }
576 *fmt++ = c;
577 *fmt = '\0';
578}
579
/* Copy the NUL-terminated char string `string` to the output position s,
   one character per output element, advancing s past the copied data.
   Uses the variables `copy` and `s` of the code that invokes it. */
#define appendstring(string) \
    {for (copy = (string); *copy; ) *s++ = *copy++;}
581
582PyObject *
583PyUnicode_FromFormatV(const char *format, va_list vargs)
584{
585 va_list count;
586 Py_ssize_t callcount = 0;
587 PyObject **callresults = NULL;
588 PyObject **callresult = NULL;
589 Py_ssize_t n = 0;
590 int width = 0;
591 int precision = 0;
592 int zeropad;
593 const char* f;
594 Py_UNICODE *s;
595 PyObject *string;
596 /* used by sprintf */
597 char buffer[21];
598 /* use abuffer instead of buffer, if we need more space
599 * (which can happen if there's a format specifier with width). */
600 char *abuffer = NULL;
601 char *realbuffer;
602 Py_ssize_t abuffersize = 0;
603 char fmt[60]; /* should be enough for %0width.precisionld */
604 const char *copy;
605
606#ifdef VA_LIST_IS_ARRAY
607 Py_MEMCPY(count, vargs, sizeof(va_list));
608#else
609#ifdef __va_copy
610 __va_copy(count, vargs);
611#else
612 count = vargs;
613#endif
614#endif
615 /* step 1: count the number of %S/%R format specifications
616 * (we call PyObject_Str()/PyObject_Repr() for these objects
617 * once during step 3 and put the result in an array) */
618 for (f = format; *f; f++) {
619 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
620 ++callcount;
621 }
622 /* step 2: allocate memory for the results of
623 * PyObject_Str()/PyObject_Repr() calls */
624 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000625 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000626 if (!callresults) {
627 PyErr_NoMemory();
628 return NULL;
629 }
630 callresult = callresults;
631 }
632 /* step 3: figure out how large a buffer we need */
633 for (f = format; *f; f++) {
634 if (*f == '%') {
635 const char* p = f;
636 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000637 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000638 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000639 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640 ;
641
642 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
643 * they don't affect the amount of space we reserve.
644 */
645 if ((*f == 'l' || *f == 'z') &&
646 (f[1] == 'd' || f[1] == 'u'))
647 ++f;
648
649 switch (*f) {
650 case 'c':
651 (void)va_arg(count, int);
652 /* fall through... */
653 case '%':
654 n++;
655 break;
656 case 'd': case 'u': case 'i': case 'x':
657 (void) va_arg(count, int);
658 /* 20 bytes is enough to hold a 64-bit
659 integer. Decimal takes the most space.
660 This isn't enough for octal.
661 If a width is specified we need more
662 (which we allocate later). */
663 if (width < 20)
664 width = 20;
665 n += width;
666 if (abuffersize < width)
667 abuffersize = width;
668 break;
669 case 's':
670 {
671 /* UTF-8 */
672 unsigned char*s;
673 s = va_arg(count, unsigned char*);
674 while (*s) {
675 if (*s < 128) {
676 n++; s++;
677 } else if (*s < 0xc0) {
678 /* invalid UTF-8 */
679 n++; s++;
680 } else if (*s < 0xc0) {
681 n++;
682 s++; if(!*s)break;
683 s++;
684 } else if (*s < 0xe0) {
685 n++;
686 s++; if(!*s)break;
687 s++; if(!*s)break;
688 s++;
689 } else {
690 #ifdef Py_UNICODE_WIDE
691 n++;
692 #else
693 n+=2;
694 #endif
695 s++; if(!*s)break;
696 s++; if(!*s)break;
697 s++; if(!*s)break;
698 s++;
699 }
700 }
701 break;
702 }
703 case 'U':
704 {
705 PyObject *obj = va_arg(count, PyObject *);
706 assert(obj && PyUnicode_Check(obj));
707 n += PyUnicode_GET_SIZE(obj);
708 break;
709 }
710 case 'V':
711 {
712 PyObject *obj = va_arg(count, PyObject *);
713 const char *str = va_arg(count, const char *);
714 assert(obj || str);
715 assert(!obj || PyUnicode_Check(obj));
716 if (obj)
717 n += PyUnicode_GET_SIZE(obj);
718 else
719 n += strlen(str);
720 break;
721 }
722 case 'S':
723 {
724 PyObject *obj = va_arg(count, PyObject *);
725 PyObject *str;
726 assert(obj);
727 str = PyObject_Str(obj);
728 if (!str)
729 goto fail;
730 n += PyUnicode_GET_SIZE(str);
731 /* Remember the str and switch to the next slot */
732 *callresult++ = str;
733 break;
734 }
735 case 'R':
736 {
737 PyObject *obj = va_arg(count, PyObject *);
738 PyObject *repr;
739 assert(obj);
740 repr = PyObject_Repr(obj);
741 if (!repr)
742 goto fail;
743 n += PyUnicode_GET_SIZE(repr);
744 /* Remember the repr and switch to the next slot */
745 *callresult++ = repr;
746 break;
747 }
748 case 'p':
749 (void) va_arg(count, int);
750 /* maximum 64-bit pointer representation:
751 * 0xffffffffffffffff
752 * so 19 characters is enough.
753 * XXX I count 18 -- what's the extra for?
754 */
755 n += 19;
756 break;
757 default:
758 /* if we stumble upon an unknown
759 formatting code, copy the rest of
760 the format string to the output
761 string. (we cannot just skip the
762 code, since there's no way to know
763 what's in the argument list) */
764 n += strlen(p);
765 goto expand;
766 }
767 } else
768 n++;
769 }
770 expand:
771 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000772 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000773 if (!abuffer) {
774 PyErr_NoMemory();
775 goto fail;
776 }
777 realbuffer = abuffer;
778 }
779 else
780 realbuffer = buffer;
781 /* step 4: fill the buffer */
782 /* Since we've analyzed how much space we need for the worst case,
783 we don't have to resize the string.
784 There can be no errors beyond this point. */
785 string = PyUnicode_FromUnicode(NULL, n);
786 if (!string)
787 goto fail;
788
789 s = PyUnicode_AS_UNICODE(string);
790 callresult = callresults;
791
792 for (f = format; *f; f++) {
793 if (*f == '%') {
794 const char* p = f++;
795 int longflag = 0;
796 int size_tflag = 0;
797 zeropad = (*f == '0');
798 /* parse the width.precision part */
799 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000800 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000801 width = (width*10) + *f++ - '0';
802 precision = 0;
803 if (*f == '.') {
804 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000805 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000806 precision = (precision*10) + *f++ - '0';
807 }
808 /* handle the long flag, but only for %ld and %lu.
809 others can be added when necessary. */
810 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
811 longflag = 1;
812 ++f;
813 }
814 /* handle the size_t flag. */
815 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
816 size_tflag = 1;
817 ++f;
818 }
819
820 switch (*f) {
821 case 'c':
822 *s++ = va_arg(vargs, int);
823 break;
824 case 'd':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, int));
832 appendstring(realbuffer);
833 break;
834 case 'u':
835 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
836 if (longflag)
837 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
838 else if (size_tflag)
839 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
840 else
841 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
842 appendstring(realbuffer);
843 break;
844 case 'i':
845 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
846 sprintf(realbuffer, fmt, va_arg(vargs, int));
847 appendstring(realbuffer);
848 break;
849 case 'x':
850 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
851 sprintf(realbuffer, fmt, va_arg(vargs, int));
852 appendstring(realbuffer);
853 break;
854 case 's':
855 {
856 /* Parameter must be UTF-8 encoded.
857 In case of encoding errors, use
858 the replacement character. */
859 PyObject *u;
860 p = va_arg(vargs, char*);
861 u = PyUnicode_DecodeUTF8(p, strlen(p),
862 "replace");
863 if (!u)
864 goto fail;
865 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
866 PyUnicode_GET_SIZE(u));
867 s += PyUnicode_GET_SIZE(u);
868 Py_DECREF(u);
869 break;
870 }
871 case 'U':
872 {
873 PyObject *obj = va_arg(vargs, PyObject *);
874 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
875 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
876 s += size;
877 break;
878 }
879 case 'V':
880 {
881 PyObject *obj = va_arg(vargs, PyObject *);
882 const char *str = va_arg(vargs, const char *);
883 if (obj) {
884 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
885 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
886 s += size;
887 } else {
888 appendstring(str);
889 }
890 break;
891 }
892 case 'S':
893 case 'R':
894 {
895 Py_UNICODE *ucopy;
896 Py_ssize_t usize;
897 Py_ssize_t upos;
898 /* unused, since we already have the result */
899 (void) va_arg(vargs, PyObject *);
900 ucopy = PyUnicode_AS_UNICODE(*callresult);
901 usize = PyUnicode_GET_SIZE(*callresult);
902 for (upos = 0; upos<usize;)
903 *s++ = ucopy[upos++];
904 /* We're done with the unicode()/repr() => forget it */
905 Py_DECREF(*callresult);
906 /* switch to next unicode()/repr() result */
907 ++callresult;
908 break;
909 }
910 case 'p':
911 sprintf(buffer, "%p", va_arg(vargs, void*));
912 /* %p is ill-defined: ensure leading 0x. */
913 if (buffer[1] == 'X')
914 buffer[1] = 'x';
915 else if (buffer[1] != 'x') {
916 memmove(buffer+2, buffer, strlen(buffer)+1);
917 buffer[0] = '0';
918 buffer[1] = 'x';
919 }
920 appendstring(buffer);
921 break;
922 case '%':
923 *s++ = '%';
924 break;
925 default:
926 appendstring(p);
927 goto end;
928 }
929 } else
930 *s++ = *f;
931 }
932
933 end:
934 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000935 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000936 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000937 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
939 return string;
940 fail:
941 if (callresults) {
942 PyObject **callresult2 = callresults;
943 while (callresult2 < callresult) {
944 Py_DECREF(*callresult2);
945 ++callresult2;
946 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000947 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000948 }
949 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000950 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000951 return NULL;
952}
953
954#undef appendstring
955
956PyObject *
957PyUnicode_FromFormat(const char *format, ...)
958{
959 PyObject* ret;
960 va_list vargs;
961
962#ifdef HAVE_STDARG_PROTOTYPES
963 va_start(vargs, format);
964#else
965 va_start(vargs);
966#endif
967 ret = PyUnicode_FromFormatV(format, vargs);
968 va_end(vargs);
969 return ret;
970}
971
Martin v. Löwis18e16552006-02-15 17:27:45 +0000972Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
973 wchar_t *w,
974 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975{
976 if (unicode == NULL) {
977 PyErr_BadInternalCall();
978 return -1;
979 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000980
981 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000983 size = PyUnicode_GET_SIZE(unicode) + 1;
984
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985#ifdef HAVE_USABLE_WCHAR_T
986 memcpy(w, unicode->str, size * sizeof(wchar_t));
987#else
988 {
989 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000990 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000992 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993 *w++ = *u++;
994 }
995#endif
996
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000997 if (size > PyUnicode_GET_SIZE(unicode))
998 return PyUnicode_GET_SIZE(unicode);
999 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 return size;
1001}
1002
1003#endif
1004
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001005PyObject *PyUnicode_FromOrdinal(int ordinal)
1006{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001007 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001008
1009#ifdef Py_UNICODE_WIDE
1010 if (ordinal < 0 || ordinal > 0x10ffff) {
1011 PyErr_SetString(PyExc_ValueError,
1012 "unichr() arg not in range(0x110000) "
1013 "(wide Python build)");
1014 return NULL;
1015 }
1016#else
1017 if (ordinal < 0 || ordinal > 0xffff) {
1018 PyErr_SetString(PyExc_ValueError,
1019 "unichr() arg not in range(0x10000) "
1020 "(narrow Python build)");
1021 return NULL;
1022 }
1023#endif
1024
Hye-Shik Chang40574832004-04-06 07:24:51 +00001025 s[0] = (Py_UNICODE)ordinal;
1026 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001027}
1028
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029PyObject *PyUnicode_FromObject(register PyObject *obj)
1030{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001031 /* XXX Perhaps we should make this API an alias of
1032 PyObject_Unicode() instead ?! */
1033 if (PyUnicode_CheckExact(obj)) {
1034 Py_INCREF(obj);
1035 return obj;
1036 }
1037 if (PyUnicode_Check(obj)) {
1038 /* For a Unicode subtype that's not a Unicode object,
1039 return a true Unicode object with the same data. */
1040 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1041 PyUnicode_GET_SIZE(obj));
1042 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001043 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1044}
1045
1046PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1047 const char *encoding,
1048 const char *errors)
1049{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001050 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001051 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001052 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001053
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 if (obj == NULL) {
1055 PyErr_BadInternalCall();
1056 return NULL;
1057 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001058
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001059#if 0
1060 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001061 that no encodings is given and then redirect to
1062 PyObject_Unicode() which then applies the additional logic for
1063 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001064
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 NOTE: This API should really only be used for object which
1066 represent *encoded* Unicode !
1067
1068 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001069 if (PyUnicode_Check(obj)) {
1070 if (encoding) {
1071 PyErr_SetString(PyExc_TypeError,
1072 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001073 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001074 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001075 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001076 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077#else
1078 if (PyUnicode_Check(obj)) {
1079 PyErr_SetString(PyExc_TypeError,
1080 "decoding Unicode is not supported");
1081 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001082 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001083#endif
1084
1085 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001086 if (PyString_Check(obj)) {
1087 s = PyString_AS_STRING(obj);
1088 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001089 }
Christian Heimes3497f942008-05-26 12:29:14 +00001090 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001091 /* Python 2.x specific */
1092 PyErr_Format(PyExc_TypeError,
1093 "decoding bytearray is not supported");
1094 return NULL;
1095 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001096 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1097 /* Overwrite the error message with something more useful in
1098 case of a TypeError. */
1099 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001100 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001101 "coercing to Unicode: need string or buffer, "
1102 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001103 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001104 goto onError;
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 if (len == 0) {
1109 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 }
Tim Petersced69f82003-09-16 20:30:58 +00001112 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001113 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001114
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115 return v;
1116
1117 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001118 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119}
1120
1121PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001122 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 const char *encoding,
1124 const char *errors)
1125{
1126 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001127
1128 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001129 encoding = PyUnicode_GetDefaultEncoding();
1130
1131 /* Shortcuts for common default encodings */
1132 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001134 else if (strcmp(encoding, "latin-1") == 0)
1135 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001136#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1137 else if (strcmp(encoding, "mbcs") == 0)
1138 return PyUnicode_DecodeMBCS(s, size, errors);
1139#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001140 else if (strcmp(encoding, "ascii") == 0)
1141 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142
1143 /* Decode via the codec registry */
1144 buffer = PyBuffer_FromMemory((void *)s, size);
1145 if (buffer == NULL)
1146 goto onError;
1147 unicode = PyCodec_Decode(buffer, encoding, errors);
1148 if (unicode == NULL)
1149 goto onError;
1150 if (!PyUnicode_Check(unicode)) {
1151 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001152 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001153 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 Py_DECREF(unicode);
1155 goto onError;
1156 }
1157 Py_DECREF(buffer);
1158 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001159
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 onError:
1161 Py_XDECREF(buffer);
1162 return NULL;
1163}
1164
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001165PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1166 const char *encoding,
1167 const char *errors)
1168{
1169 PyObject *v;
1170
1171 if (!PyUnicode_Check(unicode)) {
1172 PyErr_BadArgument();
1173 goto onError;
1174 }
1175
1176 if (encoding == NULL)
1177 encoding = PyUnicode_GetDefaultEncoding();
1178
1179 /* Decode via the codec registry */
1180 v = PyCodec_Decode(unicode, encoding, errors);
1181 if (v == NULL)
1182 goto onError;
1183 return v;
1184
1185 onError:
1186 return NULL;
1187}
1188
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001190 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001195
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 unicode = PyUnicode_FromUnicode(s, size);
1197 if (unicode == NULL)
1198 return NULL;
1199 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1200 Py_DECREF(unicode);
1201 return v;
1202}
1203
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001204PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1205 const char *encoding,
1206 const char *errors)
1207{
1208 PyObject *v;
1209
1210 if (!PyUnicode_Check(unicode)) {
1211 PyErr_BadArgument();
1212 goto onError;
1213 }
1214
1215 if (encoding == NULL)
1216 encoding = PyUnicode_GetDefaultEncoding();
1217
1218 /* Encode via the codec registry */
1219 v = PyCodec_Encode(unicode, encoding, errors);
1220 if (v == NULL)
1221 goto onError;
1222 return v;
1223
1224 onError:
1225 return NULL;
1226}
1227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1229 const char *encoding,
1230 const char *errors)
1231{
1232 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001233
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadArgument();
1236 goto onError;
1237 }
Fred Drakee4315f52000-05-09 19:53:39 +00001238
Tim Petersced69f82003-09-16 20:30:58 +00001239 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001240 encoding = PyUnicode_GetDefaultEncoding();
1241
1242 /* Shortcuts for common default encodings */
1243 if (errors == NULL) {
1244 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001245 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001246 else if (strcmp(encoding, "latin-1") == 0)
1247 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001248#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1249 else if (strcmp(encoding, "mbcs") == 0)
1250 return PyUnicode_AsMBCSString(unicode);
1251#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001252 else if (strcmp(encoding, "ascii") == 0)
1253 return PyUnicode_AsASCIIString(unicode);
1254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255
1256 /* Encode via the codec registry */
1257 v = PyCodec_Encode(unicode, encoding, errors);
1258 if (v == NULL)
1259 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001260 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001262 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001263 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 Py_DECREF(v);
1265 goto onError;
1266 }
1267 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001268
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 onError:
1270 return NULL;
1271}
1272
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001273PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1274 const char *errors)
1275{
1276 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1277
1278 if (v)
1279 return v;
1280 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1281 if (v && errors == NULL)
1282 ((PyUnicodeObject *)unicode)->defenc = v;
1283 return v;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1287{
1288 if (!PyUnicode_Check(unicode)) {
1289 PyErr_BadArgument();
1290 goto onError;
1291 }
1292 return PyUnicode_AS_UNICODE(unicode);
1293
1294 onError:
1295 return NULL;
1296}
1297
Martin v. Löwis18e16552006-02-15 17:27:45 +00001298Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299{
1300 if (!PyUnicode_Check(unicode)) {
1301 PyErr_BadArgument();
1302 goto onError;
1303 }
1304 return PyUnicode_GET_SIZE(unicode);
1305
1306 onError:
1307 return -1;
1308}
1309
Thomas Wouters78890102000-07-22 19:25:51 +00001310const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001311{
1312 return unicode_default_encoding;
1313}
1314
1315int PyUnicode_SetDefaultEncoding(const char *encoding)
1316{
1317 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001318
Fred Drakee4315f52000-05-09 19:53:39 +00001319 /* Make sure the encoding is valid. As side effect, this also
1320 loads the encoding into the codec registry cache. */
1321 v = _PyCodec_Lookup(encoding);
1322 if (v == NULL)
1323 goto onError;
1324 Py_DECREF(v);
1325 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001326 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001327 sizeof(unicode_default_encoding));
1328 return 0;
1329
1330 onError:
1331 return -1;
1332}
1333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334/* error handling callback helper:
1335 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001336 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 and adjust various state variables.
1338 return 0 on success, -1 on error
1339*/
1340
1341static
1342int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1343 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001344 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1345 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001346 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001348 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349
1350 PyObject *restuple = NULL;
1351 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001352 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1353 Py_ssize_t requiredsize;
1354 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001355 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 int res = -1;
1358
1359 if (*errorHandler == NULL) {
1360 *errorHandler = PyCodec_LookupError(errors);
1361 if (*errorHandler == NULL)
1362 goto onError;
1363 }
1364
1365 if (*exceptionObject == NULL) {
1366 *exceptionObject = PyUnicodeDecodeError_Create(
1367 encoding, input, insize, *startinpos, *endinpos, reason);
1368 if (*exceptionObject == NULL)
1369 goto onError;
1370 }
1371 else {
1372 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1373 goto onError;
1374 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1375 goto onError;
1376 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1377 goto onError;
1378 }
1379
1380 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1381 if (restuple == NULL)
1382 goto onError;
1383 if (!PyTuple_Check(restuple)) {
1384 PyErr_Format(PyExc_TypeError, &argparse[4]);
1385 goto onError;
1386 }
1387 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1388 goto onError;
1389 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001390 newpos = insize+newpos;
1391 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001392 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001393 goto onError;
1394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395
1396 /* need more space? (at least enough for what we
1397 have+the replacement+the rest of the string (starting
1398 at the new input position), so we won't have to check space
1399 when there are no errors in the rest of the string) */
1400 repptr = PyUnicode_AS_UNICODE(repunicode);
1401 repsize = PyUnicode_GET_SIZE(repunicode);
1402 requiredsize = *outpos + repsize + insize-newpos;
1403 if (requiredsize > outsize) {
1404 if (requiredsize<2*outsize)
1405 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001406 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407 goto onError;
1408 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1409 }
1410 *endinpos = newpos;
1411 *inptr = input + newpos;
1412 Py_UNICODE_COPY(*outptr, repptr, repsize);
1413 *outptr += repsize;
1414 *outpos += repsize;
1415 /* we made it! */
1416 res = 0;
1417
1418 onError:
1419 Py_XDECREF(restuple);
1420 return res;
1421}
1422
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001423/* --- UTF-7 Codec -------------------------------------------------------- */
1424
1425/* see RFC2152 for details */
1426
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS): true when character c must be base64
   encoded.  Anything outside 1..127 is special; inside that range the
   utf7_special class decides, with whitespace (class 2) and Set O
   (class 3) being special only when the matching flag is non-zero.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Written as
   explicit ASCII ranges rather than isalnum(): isalnum() depends on the
   current locale and may accept bytes above 127, which are not base64
   digits and would be turned into bogus values by UB64(). */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six as a base64 digit through out and reduce bits by six. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the top sixteen as a Py_UNICODE and store it
   through out.  Values in 0xDC00..0xDFFF are reported as an error instead:
   the macro assigns the local variable `errmsg` and jumps to the local
   label `utf7Error`, so both must exist in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001488
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001490 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491 const char *errors)
1492{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001493 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1494}
1495
/* Decode `size` bytes of UTF-7 starting at `s` into a new Unicode object.

   errors   - error handler name, forwarded to
              unicode_decode_call_errorhandler() for every decoding error.
   consumed - may be NULL.  When NULL, input that ends inside a shift
              sequence is reported as "unterminated shift sequence".
              When non-NULL that is not an error; *consumed is set to the
              recorded start position (startinpos) if decoding stopped
              inside a shift sequence, else to the number of bytes
              processed.

   Returns the new object, or NULL with an exception set.  The output is
   allocated with `size` slots and resized to the used length before
   returning. */
1496PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1497 Py_ssize_t size,
1498 const char *errors,
1499 Py_ssize_t *consumed)
1500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 const char *starts = s;
/* startinpos is assigned when a '+' or an unexpected special character is
   read, before any use in error reporting or *consumed. */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001502 Py_ssize_t startinpos;
1503 Py_ssize_t endinpos;
1504 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505 const char *e;
1506 PyUnicodeObject *unicode;
1507 Py_UNICODE *p;
/* errmsg is also assigned by the DECODE macro before it jumps to utf7Error. */
1508 const char *errmsg = "";
/* inShift: non-zero while inside a '+'...  base64 section.
   charsleft accumulates decoded base64 bits; bitsleft counts how many of
   them are still pending. */
1509 int inShift = 0;
1510 unsigned int bitsleft = 0;
1511 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 int surrogate = 0;
1513 PyObject *errorHandler = NULL;
1514 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001515
1516 unicode = _PyUnicode_New(size);
1517 if (!unicode)
1518 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001519 if (size == 0) {
1520 if (consumed)
1521 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001523 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524
1525 p = unicode->str;
1526 e = s + size;
1527
1528 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 Py_UNICODE ch;
/* restart: re-entry point used after the end-of-input error handler (below
   the loop) has moved s back to a position before e. */
1530 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001531 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532
1533 if (inShift) {
/* A '-' or any non-base64 byte ends the shift sequence. */
1534 if ((ch == '-') || !B64CHAR(ch)) {
1535 inShift = 0;
1536 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001537
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1539 if (bitsleft >= 6) {
1540 /* The shift sequence has a partial character in it. If
1541 bitsleft < 6 then we could just classify it as padding
1542 but that is not the case here */
1543
1544 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001545 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 }
1547 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001548 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 here to indicate the potential of a misencoded character. */
1550
1551 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1552 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1553 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001554 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001555 }
1556
/* The terminating byte itself: a '-' is dropped, except that when the next
   byte is also '-' a literal '-' is emitted and shift mode is re-entered;
   any other non-special terminator is copied to the output. */
1557 if (ch == '-') {
1558 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001559 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560 inShift = 1;
1561 }
1562 } else if (SPECIAL(ch,0,0)) {
1563 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001564 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 } else {
1566 *p++ = ch;
1567 }
1568 } else {
/* Base64 digit: append its six bits and flush any complete 16-bit units. */
1569 charsleft = (charsleft << 6) | UB64(ch);
1570 bitsleft += 6;
1571 s++;
1572 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1573 }
1574 }
/* '+' opens a shift sequence, except that "+-" stands for a literal '+'. */
1575 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 s++;
1578 if (s < e && *s == '-') {
1579 s++;
1580 *p++ = '+';
1581 } else
1582 {
1583 inShift = 1;
1584 bitsleft = 0;
1585 }
1586 }
1587 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001588 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 errmsg = "unexpected special character";
1590 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001591 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 }
1593 else {
1594 *p++ = ch;
1595 s++;
1596 }
1597 continue;
/* utf7Error: reached only by goto (including from the DECODE macro).  The
   handler receives pointers to unicode, s and p and may update them. */
1598 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001599 outpos = p-PyUnicode_AS_UNICODE(unicode);
1600 endinpos = s-starts;
1601 if (unicode_decode_call_errorhandler(
1602 errors, &errorHandler,
1603 "utf7", errmsg,
1604 starts, size, &startinpos, &endinpos, &exc, &s,
1605 (PyObject **)&unicode, &outpos, &p))
1606 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 }
1608
/* Input ended inside a shift sequence: an error only for stateless
   decoding (consumed == NULL). */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001609 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 outpos = p-PyUnicode_AS_UNICODE(unicode);
1611 endinpos = size;
1612 if (unicode_decode_call_errorhandler(
1613 errors, &errorHandler,
1614 "utf7", "unterminated shift sequence",
1615 starts, size, &startinpos, &endinpos, &exc, &s,
1616 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 if (s < e)
1619 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001621 if (consumed) {
1622 if(inShift)
1623 *consumed = startinpos;
1624 else
1625 *consumed = s-starts;
1626 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627
/* Shrink the result to the number of code units actually written. */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001628 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629 goto onError;
1630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 Py_XDECREF(errorHandler);
1632 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633 return (PyObject *)unicode;
1634
/* Failure path: release the cached handler, the exception object and the
   partially built result. */
1635onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 Py_XDECREF(errorHandler);
1637 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 Py_DECREF(unicode);
1639 return NULL;
1640}
1641
1642
1643PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001644 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 int encodeSetO,
1646 int encodeWhiteSpace,
1647 const char *errors)
1648{
1649 PyObject *v;
1650 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001651 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001653 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 unsigned int bitsleft = 0;
1655 unsigned long charsleft = 0;
1656 char * out;
1657 char * start;
1658
Neal Norwitze7d8be82008-07-31 17:17:14 +00001659 if (cbAllocated / 5 != size)
1660 return PyErr_NoMemory();
1661
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 if (size == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001663 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001665 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 if (v == NULL)
1667 return NULL;
1668
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001669 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 for (;i < size; ++i) {
1671 Py_UNICODE ch = s[i];
1672
1673 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001674 if (ch == '+') {
1675 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 *out++ = '-';
1677 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1678 charsleft = ch;
1679 bitsleft = 16;
1680 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001681 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001683 } else {
1684 *out++ = (char) ch;
1685 }
1686 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1688 *out++ = B64(charsleft << (6-bitsleft));
1689 charsleft = 0;
1690 bitsleft = 0;
1691 /* Characters not in the BASE64 set implicitly unshift the sequence
1692 so no '-' is required, except if the character is itself a '-' */
1693 if (B64CHAR(ch) || ch == '-') {
1694 *out++ = '-';
1695 }
1696 inShift = 0;
1697 *out++ = (char) ch;
1698 } else {
1699 bitsleft += 16;
1700 charsleft = (charsleft << 16) | ch;
1701 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1702
1703 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001704 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 or '-' then the shift sequence will be terminated implicitly and we
1706 don't have to insert a '-'. */
1707
1708 if (bitsleft == 0) {
1709 if (i + 1 < size) {
1710 Py_UNICODE ch2 = s[i+1];
1711
1712 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001713
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 } else if (B64CHAR(ch2) || ch2 == '-') {
1715 *out++ = '-';
1716 inShift = 0;
1717 } else {
1718 inShift = 0;
1719 }
1720
1721 }
1722 else {
1723 *out++ = '-';
1724 inShift = 0;
1725 }
1726 }
Tim Petersced69f82003-09-16 20:30:58 +00001727 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 if (bitsleft) {
1731 *out++= B64(charsleft << (6-bitsleft) );
1732 *out++ = '-';
1733 }
1734
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001735 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001736 return v;
1737}
1738
/* The UTF-7 helper macros are private to the codec above; drop them so
   they cannot leak into the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1745
/* --- UTF-8 Codec -------------------------------------------------------- */
1747
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  A zero
   entry marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1769
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 const char *errors)
1773{
Walter Dörwald69652032004-09-07 20:24:22 +00001774 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1775}
1776
1777PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001778 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001779 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001780 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001781{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001784 Py_ssize_t startinpos;
1785 Py_ssize_t endinpos;
1786 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *e;
1788 PyUnicodeObject *unicode;
1789 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001790 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 PyObject *errorHandler = NULL;
1792 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793
1794 /* Note: size will always be longer than the resulting Unicode
1795 character count */
1796 unicode = _PyUnicode_New(size);
1797 if (!unicode)
1798 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001799 if (size == 0) {
1800 if (consumed)
1801 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804
1805 /* Unpack UTF-8 encoded data */
1806 p = unicode->str;
1807 e = s + size;
1808
1809 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001810 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001813 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 s++;
1815 continue;
1816 }
1817
1818 n = utf8_code_length[ch];
1819
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001821 if (consumed)
1822 break;
1823 else {
1824 errmsg = "unexpected end of data";
1825 startinpos = s-starts;
1826 endinpos = size;
1827 goto utf8Error;
1828 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830
1831 switch (n) {
1832
1833 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838
1839 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 startinpos = s-starts;
1842 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844
1845 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 if ((s[1] & 0xc0) != 0x80) {
1847 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 startinpos = s-starts;
1849 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 goto utf8Error;
1851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 startinpos = s-starts;
1855 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 errmsg = "illegal encoding";
1857 goto utf8Error;
1858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 break;
1862
1863 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001864 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001865 (s[2] & 0xc0) != 0x80) {
1866 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 startinpos = s-starts;
1868 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 goto utf8Error;
1870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001872 if (ch < 0x0800) {
1873 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001874 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001875
1876 XXX For wide builds (UCS-4) we should probably try
1877 to recombine the surrogates into a single code
1878 unit.
1879 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 startinpos = s-starts;
1882 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001883 goto utf8Error;
1884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001886 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001887 break;
1888
1889 case 4:
1890 if ((s[1] & 0xc0) != 0x80 ||
1891 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 (s[3] & 0xc0) != 0x80) {
1893 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 startinpos = s-starts;
1895 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 goto utf8Error;
1897 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001898 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1899 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1900 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001902 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001903 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001904 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 startinpos = s-starts;
1908 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001909 goto utf8Error;
1910 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = (Py_UNICODE)ch;
1913#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001914 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001915
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 /* translate from 10000..10FFFF to 0..FFFF */
1917 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001918
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001919 /* high surrogate = top 10 bits added to D800 */
1920 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001921
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001922 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001923 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 break;
1926
1927 default:
1928 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001930 startinpos = s-starts;
1931 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
1934 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001935 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001936
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 outpos = p-PyUnicode_AS_UNICODE(unicode);
1939 if (unicode_decode_call_errorhandler(
1940 errors, &errorHandler,
1941 "utf8", errmsg,
1942 starts, size, &startinpos, &endinpos, &exc, &s,
1943 (PyObject **)&unicode, &outpos, &p))
1944 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 }
Walter Dörwald69652032004-09-07 20:24:22 +00001946 if (consumed)
1947 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948
1949 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001950 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 goto onError;
1952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 Py_XDECREF(errorHandler);
1954 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 return (PyObject *)unicode;
1956
1957onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001958 Py_XDECREF(errorHandler);
1959 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 Py_DECREF(unicode);
1961 return NULL;
1962}
1963
Tim Peters602f7402002-04-27 18:03:26 +00001964/* Allocation strategy: if the string is short, convert into a stack buffer
1965 and allocate exactly as much space needed at the end. Else allocate the
1966 maximum possible needed (4 result bytes per Unicode character), and return
1967 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001968*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001969PyObject *
1970PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001972 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973{
Tim Peters602f7402002-04-27 18:03:26 +00001974#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001975
Martin v. Löwis18e16552006-02-15 17:27:45 +00001976 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001977 PyObject *v; /* result string object */
1978 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001979 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001980 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001981 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001982
Tim Peters602f7402002-04-27 18:03:26 +00001983 assert(s != NULL);
1984 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
Tim Peters602f7402002-04-27 18:03:26 +00001986 if (size <= MAX_SHORT_UNICHARS) {
1987 /* Write into the stack buffer; nallocated can't overflow.
1988 * At the end, we'll allocate exactly as much heap space as it
1989 * turns out we need.
1990 */
1991 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1992 v = NULL; /* will allocate after we're done */
1993 p = stackbuf;
1994 }
1995 else {
1996 /* Overallocate on the heap, and give the excess back at the end. */
1997 nallocated = size * 4;
1998 if (nallocated / 4 != size) /* overflow! */
1999 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002000 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002001 if (v == NULL)
2002 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002003 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002004 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002005
Tim Peters602f7402002-04-27 18:03:26 +00002006 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002007 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002008
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002009 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002010 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002012
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002014 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002015 *p++ = (char)(0xc0 | (ch >> 6));
2016 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002018 else {
Tim Peters602f7402002-04-27 18:03:26 +00002019 /* Encode UCS2 Unicode ordinals */
2020 if (ch < 0x10000) {
2021 /* Special case: check for high surrogate */
2022 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2023 Py_UCS4 ch2 = s[i];
2024 /* Check for low surrogate and combine the two to
2025 form a UCS4 value */
2026 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002027 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002028 i++;
2029 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002030 }
Tim Peters602f7402002-04-27 18:03:26 +00002031 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002033 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002034 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2035 *p++ = (char)(0x80 | (ch & 0x3f));
2036 continue;
2037 }
2038encodeUCS4:
2039 /* Encode UCS4 Unicode ordinals */
2040 *p++ = (char)(0xf0 | (ch >> 18));
2041 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2042 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2043 *p++ = (char)(0x80 | (ch & 0x3f));
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002046
Tim Peters602f7402002-04-27 18:03:26 +00002047 if (v == NULL) {
2048 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002049 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002050 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002051 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002052 }
2053 else {
2054 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002055 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002056 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002057 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002060
Tim Peters602f7402002-04-27 18:03:26 +00002061#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062}
2063
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2065{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 if (!PyUnicode_Check(unicode)) {
2067 PyErr_BadArgument();
2068 return NULL;
2069 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002070 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2071 PyUnicode_GET_SIZE(unicode),
2072 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073}
2074
/* --- UTF-32 Codec ------------------------------------------------------- */
2076
2077PyObject *
2078PyUnicode_DecodeUTF32(const char *s,
2079 Py_ssize_t size,
2080 const char *errors,
2081 int *byteorder)
2082{
2083 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2084}
2085
2086PyObject *
2087PyUnicode_DecodeUTF32Stateful(const char *s,
2088 Py_ssize_t size,
2089 const char *errors,
2090 int *byteorder,
2091 Py_ssize_t *consumed)
2092{
2093 const char *starts = s;
2094 Py_ssize_t startinpos;
2095 Py_ssize_t endinpos;
2096 Py_ssize_t outpos;
2097 PyUnicodeObject *unicode;
2098 Py_UNICODE *p;
2099#ifndef Py_UNICODE_WIDE
2100 int i, pairs;
2101#else
2102 const int pairs = 0;
2103#endif
2104 const unsigned char *q, *e;
2105 int bo = 0; /* assume native ordering by default */
2106 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002107 /* Offsets from q for retrieving bytes in the right order. */
2108#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2109 int iorder[] = {0, 1, 2, 3};
2110#else
2111 int iorder[] = {3, 2, 1, 0};
2112#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002113 PyObject *errorHandler = NULL;
2114 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002115 /* On narrow builds we split characters outside the BMP into two
2116 codepoints => count how much extra space we need. */
2117#ifndef Py_UNICODE_WIDE
2118 for (i = pairs = 0; i < size/4; i++)
2119 if (((Py_UCS4 *)s)[i] >= 0x10000)
2120 pairs++;
2121#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002122
2123 /* This might be one to much, because of a BOM */
2124 unicode = _PyUnicode_New((size+3)/4+pairs);
2125 if (!unicode)
2126 return NULL;
2127 if (size == 0)
2128 return (PyObject *)unicode;
2129
2130 /* Unpack UTF-32 encoded data */
2131 p = unicode->str;
2132 q = (unsigned char *)s;
2133 e = q + size;
2134
2135 if (byteorder)
2136 bo = *byteorder;
2137
2138 /* Check for BOM marks (U+FEFF) in the input and adjust current
2139 byte order setting accordingly. In native mode, the leading BOM
2140 mark is skipped, in all other modes, it is copied to the output
2141 stream as-is (giving a ZWNBSP character). */
2142 if (bo == 0) {
2143 if (size >= 4) {
2144 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2145 (q[iorder[1]] << 8) | q[iorder[0]];
2146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2147 if (bom == 0x0000FEFF) {
2148 q += 4;
2149 bo = -1;
2150 }
2151 else if (bom == 0xFFFE0000) {
2152 q += 4;
2153 bo = 1;
2154 }
2155#else
2156 if (bom == 0x0000FEFF) {
2157 q += 4;
2158 bo = 1;
2159 }
2160 else if (bom == 0xFFFE0000) {
2161 q += 4;
2162 bo = -1;
2163 }
2164#endif
2165 }
2166 }
2167
2168 if (bo == -1) {
2169 /* force LE */
2170 iorder[0] = 0;
2171 iorder[1] = 1;
2172 iorder[2] = 2;
2173 iorder[3] = 3;
2174 }
2175 else if (bo == 1) {
2176 /* force BE */
2177 iorder[0] = 3;
2178 iorder[1] = 2;
2179 iorder[2] = 1;
2180 iorder[3] = 0;
2181 }
2182
2183 while (q < e) {
2184 Py_UCS4 ch;
2185 /* remaining bytes at the end? (size should be divisible by 4) */
2186 if (e-q<4) {
2187 if (consumed)
2188 break;
2189 errmsg = "truncated data";
2190 startinpos = ((const char *)q)-starts;
2191 endinpos = ((const char *)e)-starts;
2192 goto utf32Error;
2193 /* The remaining input chars are ignored if the callback
2194 chooses to skip the input */
2195 }
2196 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2197 (q[iorder[1]] << 8) | q[iorder[0]];
2198
2199 if (ch >= 0x110000)
2200 {
2201 errmsg = "codepoint not in range(0x110000)";
2202 startinpos = ((const char *)q)-starts;
2203 endinpos = startinpos+4;
2204 goto utf32Error;
2205 }
2206#ifndef Py_UNICODE_WIDE
2207 if (ch >= 0x10000)
2208 {
2209 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2210 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2211 }
2212 else
2213#endif
2214 *p++ = ch;
2215 q += 4;
2216 continue;
2217 utf32Error:
2218 outpos = p-PyUnicode_AS_UNICODE(unicode);
2219 if (unicode_decode_call_errorhandler(
2220 errors, &errorHandler,
2221 "utf32", errmsg,
2222 starts, size, &startinpos, &endinpos, &exc, &s,
2223 (PyObject **)&unicode, &outpos, &p))
2224 goto onError;
2225 }
2226
2227 if (byteorder)
2228 *byteorder = bo;
2229
2230 if (consumed)
2231 *consumed = (const char *)q-starts;
2232
2233 /* Adjust length */
2234 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2235 goto onError;
2236
2237 Py_XDECREF(errorHandler);
2238 Py_XDECREF(exc);
2239 return (PyObject *)unicode;
2240
2241onError:
2242 Py_DECREF(unicode);
2243 Py_XDECREF(errorHandler);
2244 Py_XDECREF(exc);
2245 return NULL;
2246}
2247
2248PyObject *
2249PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2250 Py_ssize_t size,
2251 const char *errors,
2252 int byteorder)
2253{
2254 PyObject *v;
2255 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002256 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002257#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002258 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#else
2260 const int pairs = 0;
2261#endif
2262 /* Offsets from p for storing byte pairs in the right order. */
2263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2264 int iorder[] = {0, 1, 2, 3};
2265#else
2266 int iorder[] = {3, 2, 1, 0};
2267#endif
2268
2269#define STORECHAR(CH) \
2270 do { \
2271 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2272 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2273 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2274 p[iorder[0]] = (CH) & 0xff; \
2275 p += 4; \
2276 } while(0)
2277
2278 /* In narrow builds we can output surrogate pairs as one codepoint,
2279 so we need less space. */
2280#ifndef Py_UNICODE_WIDE
2281 for (i = pairs = 0; i < size-1; i++)
2282 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2283 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2284 pairs++;
2285#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002286 nsize = (size - pairs + (byteorder == 0));
2287 bytesize = nsize * 4;
2288 if (bytesize / 4 != nsize)
2289 return PyErr_NoMemory();
2290 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002291 if (v == NULL)
2292 return NULL;
2293
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002294 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002295 if (byteorder == 0)
2296 STORECHAR(0xFEFF);
2297 if (size == 0)
2298 return v;
2299
2300 if (byteorder == -1) {
2301 /* force LE */
2302 iorder[0] = 0;
2303 iorder[1] = 1;
2304 iorder[2] = 2;
2305 iorder[3] = 3;
2306 }
2307 else if (byteorder == 1) {
2308 /* force BE */
2309 iorder[0] = 3;
2310 iorder[1] = 2;
2311 iorder[2] = 1;
2312 iorder[3] = 0;
2313 }
2314
2315 while (size-- > 0) {
2316 Py_UCS4 ch = *s++;
2317#ifndef Py_UNICODE_WIDE
2318 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2319 Py_UCS4 ch2 = *s;
2320 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2321 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2322 s++;
2323 size--;
2324 }
2325 }
2326#endif
2327 STORECHAR(ch);
2328 }
2329 return v;
2330#undef STORECHAR
2331}
2332
2333PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2334{
2335 if (!PyUnicode_Check(unicode)) {
2336 PyErr_BadArgument();
2337 return NULL;
2338 }
2339 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2340 PyUnicode_GET_SIZE(unicode),
2341 NULL,
2342 0);
2343}
2344
/* --- UTF-16 Codec ------------------------------------------------------- */
2346
Tim Peters772747b2001-08-09 22:21:55 +00002347PyObject *
2348PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002349 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002350 const char *errors,
2351 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352{
Walter Dörwald69652032004-09-07 20:24:22 +00002353 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2354}
2355
2356PyObject *
2357PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002358 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002359 const char *errors,
2360 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002361 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002363 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002364 Py_ssize_t startinpos;
2365 Py_ssize_t endinpos;
2366 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 PyUnicodeObject *unicode;
2368 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002369 const unsigned char *q, *e;
2370 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002371 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002372 /* Offsets from q for retrieving byte pairs in the right order. */
2373#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2374 int ihi = 1, ilo = 0;
2375#else
2376 int ihi = 0, ilo = 1;
2377#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002378 PyObject *errorHandler = NULL;
2379 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380
2381 /* Note: size will always be longer than the resulting Unicode
2382 character count */
2383 unicode = _PyUnicode_New(size);
2384 if (!unicode)
2385 return NULL;
2386 if (size == 0)
2387 return (PyObject *)unicode;
2388
2389 /* Unpack UTF-16 encoded data */
2390 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002391 q = (unsigned char *)s;
2392 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393
2394 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002395 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002397 /* Check for BOM marks (U+FEFF) in the input and adjust current
2398 byte order setting accordingly. In native mode, the leading BOM
2399 mark is skipped, in all other modes, it is copied to the output
2400 stream as-is (giving a ZWNBSP character). */
2401 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002402 if (size >= 2) {
2403 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002404#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002405 if (bom == 0xFEFF) {
2406 q += 2;
2407 bo = -1;
2408 }
2409 else if (bom == 0xFFFE) {
2410 q += 2;
2411 bo = 1;
2412 }
Tim Petersced69f82003-09-16 20:30:58 +00002413#else
Walter Dörwald69652032004-09-07 20:24:22 +00002414 if (bom == 0xFEFF) {
2415 q += 2;
2416 bo = 1;
2417 }
2418 else if (bom == 0xFFFE) {
2419 q += 2;
2420 bo = -1;
2421 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002422#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002423 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425
Tim Peters772747b2001-08-09 22:21:55 +00002426 if (bo == -1) {
2427 /* force LE */
2428 ihi = 1;
2429 ilo = 0;
2430 }
2431 else if (bo == 1) {
2432 /* force BE */
2433 ihi = 0;
2434 ilo = 1;
2435 }
2436
2437 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002438 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002439 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002440 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002441 if (consumed)
2442 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443 errmsg = "truncated data";
2444 startinpos = ((const char *)q)-starts;
2445 endinpos = ((const char *)e)-starts;
2446 goto utf16Error;
2447 /* The remaining input chars are ignored if the callback
2448 chooses to skip the input */
2449 }
2450 ch = (q[ihi] << 8) | q[ilo];
2451
Tim Peters772747b2001-08-09 22:21:55 +00002452 q += 2;
2453
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 if (ch < 0xD800 || ch > 0xDFFF) {
2455 *p++ = ch;
2456 continue;
2457 }
2458
2459 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002460 if (q >= e) {
2461 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 startinpos = (((const char *)q)-2)-starts;
2463 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002464 goto utf16Error;
2465 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002466 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002467 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2468 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002469 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002470#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002471 *p++ = ch;
2472 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002473#else
2474 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002475#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002476 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477 }
2478 else {
2479 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 startinpos = (((const char *)q)-4)-starts;
2481 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002482 goto utf16Error;
2483 }
2484
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002486 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 startinpos = (((const char *)q)-2)-starts;
2488 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002489 /* Fall through to report the error */
2490
2491 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 outpos = p-PyUnicode_AS_UNICODE(unicode);
2493 if (unicode_decode_call_errorhandler(
2494 errors, &errorHandler,
2495 "utf16", errmsg,
2496 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2497 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002498 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 }
2500
2501 if (byteorder)
2502 *byteorder = bo;
2503
Walter Dörwald69652032004-09-07 20:24:22 +00002504 if (consumed)
2505 *consumed = (const char *)q-starts;
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002508 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 goto onError;
2510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 Py_XDECREF(errorHandler);
2512 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 return (PyObject *)unicode;
2514
2515onError:
2516 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517 Py_XDECREF(errorHandler);
2518 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 return NULL;
2520}
2521
Tim Peters772747b2001-08-09 22:21:55 +00002522PyObject *
2523PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002524 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002525 const char *errors,
2526 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527{
2528 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002529 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002530 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002531#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002532 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002533#else
2534 const int pairs = 0;
2535#endif
Tim Peters772747b2001-08-09 22:21:55 +00002536 /* Offsets from p for storing byte pairs in the right order. */
2537#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2538 int ihi = 1, ilo = 0;
2539#else
2540 int ihi = 0, ilo = 1;
2541#endif
2542
2543#define STORECHAR(CH) \
2544 do { \
2545 p[ihi] = ((CH) >> 8) & 0xff; \
2546 p[ilo] = (CH) & 0xff; \
2547 p += 2; \
2548 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002550#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002551 for (i = pairs = 0; i < size; i++)
2552 if (s[i] >= 0x10000)
2553 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002554#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002555 /* 2 * (size + pairs + (byteorder == 0)) */
2556 if (size > PY_SSIZE_T_MAX ||
2557 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2558 return PyErr_NoMemory();
2559 nsize = size + pairs + (byteorder == 0);
2560 bytesize = nsize * 2;
2561 if (bytesize / 2 != nsize)
2562 return PyErr_NoMemory();
2563 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 if (v == NULL)
2565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002567 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002569 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002570 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002571 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002572
2573 if (byteorder == -1) {
2574 /* force LE */
2575 ihi = 1;
2576 ilo = 0;
2577 }
2578 else if (byteorder == 1) {
2579 /* force BE */
2580 ihi = 0;
2581 ilo = 1;
2582 }
2583
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002584 while (size-- > 0) {
2585 Py_UNICODE ch = *s++;
2586 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002587#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002589 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2590 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002592#endif
Tim Peters772747b2001-08-09 22:21:55 +00002593 STORECHAR(ch);
2594 if (ch2)
2595 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599}
2600
2601PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2602{
2603 if (!PyUnicode_Check(unicode)) {
2604 PyErr_BadArgument();
2605 return NULL;
2606 }
2607 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2608 PyUnicode_GET_SIZE(unicode),
2609 NULL,
2610 0);
2611}
2612
2613/* --- Unicode Escape Codec ----------------------------------------------- */
2614
Fredrik Lundh06d12682001-01-24 07:59:11 +00002615static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002618 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 const char *errors)
2620{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t startinpos;
2623 Py_ssize_t endinpos;
2624 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002629 char* message;
2630 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 PyObject *errorHandler = NULL;
2632 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002633
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 /* Escaped strings will always be longer than the resulting
2635 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 length after conversion to the true value.
2637 (but if the error callback returns a long replacement string
2638 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 v = _PyUnicode_New(size);
2640 if (v == NULL)
2641 goto onError;
2642 if (size == 0)
2643 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002647
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 while (s < end) {
2649 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002650 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
2653 /* Non-escape characters are interpreted as Unicode ordinals */
2654 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002655 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 continue;
2657 }
2658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 /* \ - Escapes */
2661 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002662 c = *s++;
2663 if (s > end)
2664 c = '\0'; /* Invalid after \ */
2665 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
2667 /* \x escapes */
2668 case '\n': break;
2669 case '\\': *p++ = '\\'; break;
2670 case '\'': *p++ = '\''; break;
2671 case '\"': *p++ = '\"'; break;
2672 case 'b': *p++ = '\b'; break;
2673 case 'f': *p++ = '\014'; break; /* FF */
2674 case 't': *p++ = '\t'; break;
2675 case 'n': *p++ = '\n'; break;
2676 case 'r': *p++ = '\r'; break;
2677 case 'v': *p++ = '\013'; break; /* VT */
2678 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2679
2680 /* \OOO (octal) escapes */
2681 case '0': case '1': case '2': case '3':
2682 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002683 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002684 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002685 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002686 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002687 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 break;
2691
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 /* hex escapes */
2693 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002695 digits = 2;
2696 message = "truncated \\xXX escape";
2697 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
Fredrik Lundhccc74732001-02-18 22:13:49 +00002699 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 digits = 4;
2702 message = "truncated \\uXXXX escape";
2703 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002706 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 digits = 8;
2708 message = "truncated \\UXXXXXXXX escape";
2709 hexescape:
2710 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 outpos = p-PyUnicode_AS_UNICODE(v);
2712 if (s+digits>end) {
2713 endinpos = size;
2714 if (unicode_decode_call_errorhandler(
2715 errors, &errorHandler,
2716 "unicodeescape", "end of string in escape sequence",
2717 starts, size, &startinpos, &endinpos, &exc, &s,
2718 (PyObject **)&v, &outpos, &p))
2719 goto onError;
2720 goto nextByte;
2721 }
2722 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002723 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002724 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 endinpos = (s+i+1)-starts;
2726 if (unicode_decode_call_errorhandler(
2727 errors, &errorHandler,
2728 "unicodeescape", message,
2729 starts, size, &startinpos, &endinpos, &exc, &s,
2730 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002733 }
2734 chr = (chr<<4) & ~0xF;
2735 if (c >= '0' && c <= '9')
2736 chr += c - '0';
2737 else if (c >= 'a' && c <= 'f')
2738 chr += 10 + c - 'a';
2739 else
2740 chr += 10 + c - 'A';
2741 }
2742 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002743 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 /* _decoding_error will have already written into the
2745 target buffer. */
2746 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002748 /* when we get here, chr is a 32-bit unicode character */
2749 if (chr <= 0xffff)
2750 /* UCS-2 character */
2751 *p++ = (Py_UNICODE) chr;
2752 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002753 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002754 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002755#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002756 *p++ = chr;
2757#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002758 chr -= 0x10000L;
2759 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002760 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002761#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002762 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 endinpos = s-starts;
2764 outpos = p-PyUnicode_AS_UNICODE(v);
2765 if (unicode_decode_call_errorhandler(
2766 errors, &errorHandler,
2767 "unicodeescape", "illegal Unicode character",
2768 starts, size, &startinpos, &endinpos, &exc, &s,
2769 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002770 goto onError;
2771 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 break;
2773
2774 /* \N{name} */
2775 case 'N':
2776 message = "malformed \\N character escape";
2777 if (ucnhash_CAPI == NULL) {
2778 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002779 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002780 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 if (m == NULL)
2782 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002783 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002785 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002786 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002787 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002788 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002789 if (ucnhash_CAPI == NULL)
2790 goto ucnhashError;
2791 }
2792 if (*s == '{') {
2793 const char *start = s+1;
2794 /* look for the closing brace */
2795 while (*s != '}' && s < end)
2796 s++;
2797 if (s > start && s < end && *s == '}') {
2798 /* found a name. look it up in the unicode database */
2799 message = "unknown Unicode character name";
2800 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002801 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802 goto store;
2803 }
2804 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 endinpos = s-starts;
2806 outpos = p-PyUnicode_AS_UNICODE(v);
2807 if (unicode_decode_call_errorhandler(
2808 errors, &errorHandler,
2809 "unicodeescape", message,
2810 starts, size, &startinpos, &endinpos, &exc, &s,
2811 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002813 break;
2814
2815 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002816 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 message = "\\ at end of string";
2818 s--;
2819 endinpos = s-starts;
2820 outpos = p-PyUnicode_AS_UNICODE(v);
2821 if (unicode_decode_call_errorhandler(
2822 errors, &errorHandler,
2823 "unicodeescape", message,
2824 starts, size, &startinpos, &endinpos, &exc, &s,
2825 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002826 goto onError;
2827 }
2828 else {
2829 *p++ = '\\';
2830 *p++ = (unsigned char)s[-1];
2831 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002832 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 nextByte:
2835 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002837 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002839 Py_XDECREF(errorHandler);
2840 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002842
Fredrik Lundhccc74732001-02-18 22:13:49 +00002843ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002844 PyErr_SetString(
2845 PyExc_UnicodeError,
2846 "\\N escapes not supported (can't load unicodedata module)"
2847 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002848 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 Py_XDECREF(errorHandler);
2850 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002851 return NULL;
2852
Fredrik Lundhccc74732001-02-18 22:13:49 +00002853onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 Py_XDECREF(errorHandler);
2856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 return NULL;
2858}
2859
2860/* Return a Unicode-Escape string version of the Unicode object.
2861
2862 If quotes is true, the string is enclosed in u"" or u'' quotes as
2863 appropriate.
2864
2865*/
2866
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002867Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002868 Py_ssize_t size,
2869 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002870{
2871 /* like wcschr, but doesn't stop at NULL characters */
2872
2873 while (size-- > 0) {
2874 if (*s == ch)
2875 return s;
2876 s++;
2877 }
2878
2879 return NULL;
2880}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882static
2883PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002884 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 int quotes)
2886{
2887 PyObject *repr;
2888 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002890 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002891#ifdef Py_UNICODE_WIDE
2892 const Py_ssize_t expandsize = 10;
2893#else
2894 const Py_ssize_t expandsize = 6;
2895#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896
Neal Norwitz17753ec2006-08-21 22:21:19 +00002897 /* XXX(nnorwitz): rather than over-allocating, it would be
2898 better to choose a different scheme. Perhaps scan the
2899 first N-chars of the string and allocate based on that size.
2900 */
2901 /* Initial allocation is based on the longest-possible unichr
2902 escape.
2903
2904 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2905 unichr, so in this case it's the longest unichr escape. In
2906 narrow (UTF-16) builds this is five chars per source unichr
2907 since there are two unichrs in the surrogate pair, so in narrow
2908 (UTF-16) builds it's not the longest unichr escape.
2909
2910 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2911 so in the narrow (UTF-16) build case it's the longest unichr
2912 escape.
2913 */
2914
Neal Norwitze7d8be82008-07-31 17:17:14 +00002915 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2916 return PyErr_NoMemory();
2917
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002918 repr = PyString_FromStringAndSize(NULL,
Neal Norwitz17753ec2006-08-21 22:21:19 +00002919 2
Neal Norwitze7d8be82008-07-31 17:17:14 +00002920 + expandsize*size
Neal Norwitz17753ec2006-08-21 22:21:19 +00002921 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 if (repr == NULL)
2923 return NULL;
2924
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002925 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926
2927 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002929 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 !findchar(s, size, '"')) ? '"' : '\'';
2931 }
2932 while (size-- > 0) {
2933 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002934
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002935 /* Escape quotes and backslashes */
2936 if ((quotes &&
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002937 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 *p++ = '\\';
2939 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002940 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002941 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002942
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002943#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002944 /* Map 21-bit characters to '\U00xxxxxx' */
2945 else if (ch >= 0x10000) {
2946 *p++ = '\\';
2947 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002948 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2949 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2950 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2951 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2952 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002955 *p++ = hexdigit[ch & 0x0000000F];
2956 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002957 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002958#else
2959 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002960 else if (ch >= 0xD800 && ch < 0xDC00) {
2961 Py_UNICODE ch2;
2962 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002963
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002964 ch2 = *s++;
2965 size--;
2966 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2967 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2968 *p++ = '\\';
2969 *p++ = 'U';
2970 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2971 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2972 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2973 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2974 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2977 *p++ = hexdigit[ucs & 0x0000000F];
2978 continue;
2979 }
2980 /* Fall through: isolated surrogates are copied as-is */
2981 s--;
2982 size++;
2983 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002984#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002985
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002987 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 *p++ = '\\';
2989 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002990 *p++ = hexdigit[(ch >> 12) & 0x000F];
2991 *p++ = hexdigit[(ch >> 8) & 0x000F];
2992 *p++ = hexdigit[(ch >> 4) & 0x000F];
2993 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002995
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002996 /* Map special whitespace to '\t', \n', '\r' */
2997 else if (ch == '\t') {
2998 *p++ = '\\';
2999 *p++ = 't';
3000 }
3001 else if (ch == '\n') {
3002 *p++ = '\\';
3003 *p++ = 'n';
3004 }
3005 else if (ch == '\r') {
3006 *p++ = '\\';
3007 *p++ = 'r';
3008 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003009
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003010 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003011 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003013 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003014 *p++ = hexdigit[(ch >> 4) & 0x000F];
3015 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003016 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003017
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 /* Copy everything else as-is */
3019 else
3020 *p++ = (char) ch;
3021 }
3022 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003023 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024
3025 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 return repr;
3028}
3029
3030PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
3033 return unicodeescape_string(s, size, 0);
3034}
3035
3036PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3037{
3038 if (!PyUnicode_Check(unicode)) {
3039 PyErr_BadArgument();
3040 return NULL;
3041 }
3042 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3043 PyUnicode_GET_SIZE(unicode));
3044}
3045
3046/* --- Raw Unicode Escape Codec ------------------------------------------- */
3047
3048PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003049 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 const char *errors)
3051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003053 Py_ssize_t startinpos;
3054 Py_ssize_t endinpos;
3055 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 const char *end;
3059 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 PyObject *errorHandler = NULL;
3061 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003062
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 /* Escaped strings will always be longer than the resulting
3064 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 length after conversion to the true value. (But decoding error
3066 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 v = _PyUnicode_New(size);
3068 if (v == NULL)
3069 goto onError;
3070 if (size == 0)
3071 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 end = s + size;
3074 while (s < end) {
3075 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003076 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003078 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079
3080 /* Non-escape characters are interpreted as Unicode ordinals */
3081 if (*s != '\\') {
3082 *p++ = (unsigned char)*s++;
3083 continue;
3084 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086
3087 /* \u-escapes are only interpreted iff the number of leading
3088 backslashes if odd */
3089 bs = s;
3090 for (;s < end;) {
3091 if (*s != '\\')
3092 break;
3093 *p++ = (unsigned char)*s++;
3094 }
3095 if (((s - bs) & 1) == 0 ||
3096 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003097 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 continue;
3099 }
3100 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003101 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 s++;
3103
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003106 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 endinpos = s-starts;
3110 if (unicode_decode_call_errorhandler(
3111 errors, &errorHandler,
3112 "rawunicodeescape", "truncated \\uXXXX",
3113 starts, size, &startinpos, &endinpos, &exc, &s,
3114 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117 }
3118 x = (x<<4) & ~0xF;
3119 if (c >= '0' && c <= '9')
3120 x += c - '0';
3121 else if (c >= 'a' && c <= 'f')
3122 x += 10 + c - 'a';
3123 else
3124 x += 10 + c - 'A';
3125 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003126 if (x <= 0xffff)
3127 /* UCS-2 character */
3128 *p++ = (Py_UNICODE) x;
3129 else if (x <= 0x10ffff) {
3130 /* UCS-4 character. Either store directly, or as
3131 surrogate pair. */
3132#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003133 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003134#else
3135 x -= 0x10000L;
3136 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3137 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3138#endif
3139 } else {
3140 endinpos = s-starts;
3141 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003142 if (unicode_decode_call_errorhandler(
3143 errors, &errorHandler,
3144 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3145 starts, size, &startinpos, &endinpos, &exc, &s,
3146 (PyObject **)&v, &outpos, &p))
3147 goto onError;
3148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 nextByte:
3150 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003152 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003153 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 Py_XDECREF(errorHandler);
3155 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003157
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 onError:
3159 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 Py_XDECREF(errorHandler);
3161 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 return NULL;
3163}
3164
3165PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003166 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167{
3168 PyObject *repr;
3169 char *p;
3170 char *q;
3171
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003172 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003173#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003174 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003176 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00003178
3179 if (size > PY_SSIZE_T_MAX / expandsize)
3180 return PyErr_NoMemory();
3181
3182 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 if (repr == NULL)
3184 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003185 if (size == 0)
3186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003188 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 while (size-- > 0) {
3190 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003191#ifdef Py_UNICODE_WIDE
3192 /* Map 32-bit characters to '\Uxxxxxxxx' */
3193 if (ch >= 0x10000) {
3194 *p++ = '\\';
3195 *p++ = 'U';
3196 *p++ = hexdigit[(ch >> 28) & 0xf];
3197 *p++ = hexdigit[(ch >> 24) & 0xf];
3198 *p++ = hexdigit[(ch >> 20) & 0xf];
3199 *p++ = hexdigit[(ch >> 16) & 0xf];
3200 *p++ = hexdigit[(ch >> 12) & 0xf];
3201 *p++ = hexdigit[(ch >> 8) & 0xf];
3202 *p++ = hexdigit[(ch >> 4) & 0xf];
3203 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003204 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003205 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003206#else
3207 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3208 if (ch >= 0xD800 && ch < 0xDC00) {
3209 Py_UNICODE ch2;
3210 Py_UCS4 ucs;
3211
3212 ch2 = *s++;
3213 size--;
3214 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3215 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3216 *p++ = '\\';
3217 *p++ = 'U';
3218 *p++ = hexdigit[(ucs >> 28) & 0xf];
3219 *p++ = hexdigit[(ucs >> 24) & 0xf];
3220 *p++ = hexdigit[(ucs >> 20) & 0xf];
3221 *p++ = hexdigit[(ucs >> 16) & 0xf];
3222 *p++ = hexdigit[(ucs >> 12) & 0xf];
3223 *p++ = hexdigit[(ucs >> 8) & 0xf];
3224 *p++ = hexdigit[(ucs >> 4) & 0xf];
3225 *p++ = hexdigit[ucs & 0xf];
3226 continue;
3227 }
3228 /* Fall through: isolated surrogates are copied as-is */
3229 s--;
3230 size++;
3231 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003232#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 /* Map 16-bit characters to '\uxxxx' */
3234 if (ch >= 256) {
3235 *p++ = '\\';
3236 *p++ = 'u';
3237 *p++ = hexdigit[(ch >> 12) & 0xf];
3238 *p++ = hexdigit[(ch >> 8) & 0xf];
3239 *p++ = hexdigit[(ch >> 4) & 0xf];
3240 *p++ = hexdigit[ch & 15];
3241 }
3242 /* Copy everything else as-is */
3243 else
3244 *p++ = (char) ch;
3245 }
3246 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003247 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 return repr;
3249}
3250
3251PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3252{
3253 if (!PyUnicode_Check(unicode)) {
3254 PyErr_BadArgument();
3255 return NULL;
3256 }
3257 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3258 PyUnicode_GET_SIZE(unicode));
3259}
3260
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003261/* --- Unicode Internal Codec ------------------------------------------- */
3262
3263PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003265 const char *errors)
3266{
3267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003268 Py_ssize_t startinpos;
3269 Py_ssize_t endinpos;
3270 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003271 PyUnicodeObject *v;
3272 Py_UNICODE *p;
3273 const char *end;
3274 const char *reason;
3275 PyObject *errorHandler = NULL;
3276 PyObject *exc = NULL;
3277
Neal Norwitzd43069c2006-01-08 01:12:10 +00003278#ifdef Py_UNICODE_WIDE
3279 Py_UNICODE unimax = PyUnicode_GetMax();
3280#endif
3281
Armin Rigo7ccbca92006-10-04 12:17:45 +00003282 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003283 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3284 if (v == NULL)
3285 goto onError;
3286 if (PyUnicode_GetSize((PyObject *)v) == 0)
3287 return (PyObject *)v;
3288 p = PyUnicode_AS_UNICODE(v);
3289 end = s + size;
3290
3291 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003292 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003293 /* We have to sanity check the raw data, otherwise doom looms for
3294 some malformed UCS-4 data. */
3295 if (
3296 #ifdef Py_UNICODE_WIDE
3297 *p > unimax || *p < 0 ||
3298 #endif
3299 end-s < Py_UNICODE_SIZE
3300 )
3301 {
3302 startinpos = s - starts;
3303 if (end-s < Py_UNICODE_SIZE) {
3304 endinpos = end-starts;
3305 reason = "truncated input";
3306 }
3307 else {
3308 endinpos = s - starts + Py_UNICODE_SIZE;
3309 reason = "illegal code point (> 0x10FFFF)";
3310 }
3311 outpos = p - PyUnicode_AS_UNICODE(v);
3312 if (unicode_decode_call_errorhandler(
3313 errors, &errorHandler,
3314 "unicode_internal", reason,
3315 starts, size, &startinpos, &endinpos, &exc, &s,
3316 (PyObject **)&v, &outpos, &p)) {
3317 goto onError;
3318 }
3319 }
3320 else {
3321 p++;
3322 s += Py_UNICODE_SIZE;
3323 }
3324 }
3325
Martin v. Löwis412fb672006-04-13 06:34:32 +00003326 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003327 goto onError;
3328 Py_XDECREF(errorHandler);
3329 Py_XDECREF(exc);
3330 return (PyObject *)v;
3331
3332 onError:
3333 Py_XDECREF(v);
3334 Py_XDECREF(errorHandler);
3335 Py_XDECREF(exc);
3336 return NULL;
3337}
3338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339/* --- Latin-1 Codec ------------------------------------------------------ */
3340
3341PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003342 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 const char *errors)
3344{
3345 PyUnicodeObject *v;
3346 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003347
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003349 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003350 Py_UNICODE r = *(unsigned char*)s;
3351 return PyUnicode_FromUnicode(&r, 1);
3352 }
3353
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 v = _PyUnicode_New(size);
3355 if (v == NULL)
3356 goto onError;
3357 if (size == 0)
3358 return (PyObject *)v;
3359 p = PyUnicode_AS_UNICODE(v);
3360 while (size-- > 0)
3361 *p++ = (unsigned char)*s++;
3362 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003363
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 onError:
3365 Py_XDECREF(v);
3366 return NULL;
3367}
3368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369/* create or adjust a UnicodeEncodeError */
3370static void make_encode_exception(PyObject **exceptionObject,
3371 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 const Py_UNICODE *unicode, Py_ssize_t size,
3373 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 if (*exceptionObject == NULL) {
3377 *exceptionObject = PyUnicodeEncodeError_Create(
3378 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 }
3380 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3382 goto onError;
3383 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3384 goto onError;
3385 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3386 goto onError;
3387 return;
3388 onError:
3389 Py_DECREF(*exceptionObject);
3390 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 }
3392}
3393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394/* raises a UnicodeEncodeError */
3395static void raise_encode_exception(PyObject **exceptionObject,
3396 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003397 const Py_UNICODE *unicode, Py_ssize_t size,
3398 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399 const char *reason)
3400{
3401 make_encode_exception(exceptionObject,
3402 encoding, unicode, size, startpos, endpos, reason);
3403 if (*exceptionObject != NULL)
3404 PyCodec_StrictErrors(*exceptionObject);
3405}
3406
3407/* error handling callback helper:
3408 build arguments, call the callback and check the arguments,
3409 put the result into newpos and return the replacement string, which
3410 has to be freed by the caller */
3411static PyObject *unicode_encode_call_errorhandler(const char *errors,
3412 PyObject **errorHandler,
3413 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003414 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3415 Py_ssize_t startpos, Py_ssize_t endpos,
3416 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003418 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419
3420 PyObject *restuple;
3421 PyObject *resunicode;
3422
3423 if (*errorHandler == NULL) {
3424 *errorHandler = PyCodec_LookupError(errors);
3425 if (*errorHandler == NULL)
3426 return NULL;
3427 }
3428
3429 make_encode_exception(exceptionObject,
3430 encoding, unicode, size, startpos, endpos, reason);
3431 if (*exceptionObject == NULL)
3432 return NULL;
3433
3434 restuple = PyObject_CallFunctionObjArgs(
3435 *errorHandler, *exceptionObject, NULL);
3436 if (restuple == NULL)
3437 return NULL;
3438 if (!PyTuple_Check(restuple)) {
3439 PyErr_Format(PyExc_TypeError, &argparse[4]);
3440 Py_DECREF(restuple);
3441 return NULL;
3442 }
3443 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3444 &resunicode, newpos)) {
3445 Py_DECREF(restuple);
3446 return NULL;
3447 }
3448 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003449 *newpos = size+*newpos;
3450 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003451 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003452 Py_DECREF(restuple);
3453 return NULL;
3454 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 Py_INCREF(resunicode);
3456 Py_DECREF(restuple);
3457 return resunicode;
3458}
3459
3460static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003461 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 const char *errors,
3463 int limit)
3464{
3465 /* output object */
3466 PyObject *res;
3467 /* pointers to the beginning and end+1 of input */
3468 const Py_UNICODE *startp = p;
3469 const Py_UNICODE *endp = p + size;
3470 /* pointer to the beginning of the unencodable characters */
3471 /* const Py_UNICODE *badp = NULL; */
3472 /* pointer into the output */
3473 char *str;
3474 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003475 Py_ssize_t respos = 0;
3476 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003477 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3478 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 PyObject *errorHandler = NULL;
3480 PyObject *exc = NULL;
3481 /* the following variable is used for caching string comparisons
3482 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3483 int known_errorHandler = -1;
3484
3485 /* allocate enough for a simple encoding without
3486 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003487 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 if (res == NULL)
3489 goto onError;
3490 if (size == 0)
3491 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003492 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 ressize = size;
3494
3495 while (p<endp) {
3496 Py_UNICODE c = *p;
3497
3498 /* can we encode this? */
3499 if (c<limit) {
3500 /* no overflow check, because we know that the space is enough */
3501 *str++ = (char)c;
3502 ++p;
3503 }
3504 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 Py_ssize_t unicodepos = p-startp;
3506 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 Py_ssize_t repsize;
3509 Py_ssize_t newpos;
3510 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 Py_UNICODE *uni2;
3512 /* startpos for collecting unencodable chars */
3513 const Py_UNICODE *collstart = p;
3514 const Py_UNICODE *collend = p;
3515 /* find all unecodable characters */
3516 while ((collend < endp) && ((*collend)>=limit))
3517 ++collend;
3518 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3519 if (known_errorHandler==-1) {
3520 if ((errors==NULL) || (!strcmp(errors, "strict")))
3521 known_errorHandler = 1;
3522 else if (!strcmp(errors, "replace"))
3523 known_errorHandler = 2;
3524 else if (!strcmp(errors, "ignore"))
3525 known_errorHandler = 3;
3526 else if (!strcmp(errors, "xmlcharrefreplace"))
3527 known_errorHandler = 4;
3528 else
3529 known_errorHandler = 0;
3530 }
3531 switch (known_errorHandler) {
3532 case 1: /* strict */
3533 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3534 goto onError;
3535 case 2: /* replace */
3536 while (collstart++<collend)
3537 *str++ = '?'; /* fall through */
3538 case 3: /* ignore */
3539 p = collend;
3540 break;
3541 case 4: /* xmlcharrefreplace */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003542 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* determine replacement size (temporarily (mis)uses p) */
3544 for (p = collstart, repsize = 0; p < collend; ++p) {
3545 if (*p<10)
3546 repsize += 2+1+1;
3547 else if (*p<100)
3548 repsize += 2+2+1;
3549 else if (*p<1000)
3550 repsize += 2+3+1;
3551 else if (*p<10000)
3552 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003553#ifndef Py_UNICODE_WIDE
3554 else
3555 repsize += 2+5+1;
3556#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 else if (*p<100000)
3558 repsize += 2+5+1;
3559 else if (*p<1000000)
3560 repsize += 2+6+1;
3561 else
3562 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003563#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 }
3565 requiredsize = respos+repsize+(endp-collend);
3566 if (requiredsize > ressize) {
3567 if (requiredsize<2*ressize)
3568 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003569 if (_PyString_Resize(&res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003571 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 ressize = requiredsize;
3573 }
3574 /* generate replacement (temporarily (mis)uses p) */
3575 for (p = collstart; p < collend; ++p) {
3576 str += sprintf(str, "&#%d;", (int)*p);
3577 }
3578 p = collend;
3579 break;
3580 default:
3581 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3582 encoding, reason, startp, size, &exc,
3583 collstart-startp, collend-startp, &newpos);
3584 if (repunicode == NULL)
3585 goto onError;
3586 /* need more space? (at least enough for what we
3587 have+the replacement+the rest of the string, so
3588 we won't have to check space for encodable characters) */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003589 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 repsize = PyUnicode_GET_SIZE(repunicode);
3591 requiredsize = respos+repsize+(endp-collend);
3592 if (requiredsize > ressize) {
3593 if (requiredsize<2*ressize)
3594 requiredsize = 2*ressize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003595 if (_PyString_Resize(&res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 Py_DECREF(repunicode);
3597 goto onError;
3598 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003599 str = PyString_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 ressize = requiredsize;
3601 }
3602 /* check if there is anything unencodable in the replacement
3603 and copy it to the output */
3604 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3605 c = *uni2;
3606 if (c >= limit) {
3607 raise_encode_exception(&exc, encoding, startp, size,
3608 unicodepos, unicodepos+1, reason);
3609 Py_DECREF(repunicode);
3610 goto onError;
3611 }
3612 *str = (char)c;
3613 }
3614 p = startp + newpos;
3615 Py_DECREF(repunicode);
3616 }
3617 }
3618 }
3619 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003620 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 if (respos<ressize)
3622 /* If this falls res will be NULL */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003623 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 Py_XDECREF(errorHandler);
3625 Py_XDECREF(exc);
3626 return res;
3627
3628 onError:
3629 Py_XDECREF(res);
3630 Py_XDECREF(errorHandler);
3631 Py_XDECREF(exc);
3632 return NULL;
3633}
3634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003636 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 const char *errors)
3638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640}
3641
3642PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3643{
3644 if (!PyUnicode_Check(unicode)) {
3645 PyErr_BadArgument();
3646 return NULL;
3647 }
3648 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3649 PyUnicode_GET_SIZE(unicode),
3650 NULL);
3651}
3652
3653/* --- 7-bit ASCII Codec -------------------------------------------------- */
3654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003656 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 const char *errors)
3658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 PyUnicodeObject *v;
3661 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t startinpos;
3663 Py_ssize_t endinpos;
3664 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 const char *e;
3666 PyObject *errorHandler = NULL;
3667 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003668
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003670 if (size == 1 && *(unsigned char*)s < 128) {
3671 Py_UNICODE r = *(unsigned char*)s;
3672 return PyUnicode_FromUnicode(&r, 1);
3673 }
Tim Petersced69f82003-09-16 20:30:58 +00003674
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 v = _PyUnicode_New(size);
3676 if (v == NULL)
3677 goto onError;
3678 if (size == 0)
3679 return (PyObject *)v;
3680 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 e = s + size;
3682 while (s < e) {
3683 register unsigned char c = (unsigned char)*s;
3684 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 ++s;
3687 }
3688 else {
3689 startinpos = s-starts;
3690 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003691 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 if (unicode_decode_call_errorhandler(
3693 errors, &errorHandler,
3694 "ascii", "ordinal not in range(128)",
3695 starts, size, &startinpos, &endinpos, &exc, &s,
3696 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003700 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003701 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 Py_XDECREF(errorHandler);
3704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 onError:
3708 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003709 Py_XDECREF(errorHandler);
3710 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003715 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 const char *errors)
3717{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719}
3720
3721PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3722{
3723 if (!PyUnicode_Check(unicode)) {
3724 PyErr_BadArgument();
3725 return NULL;
3726 }
3727 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3728 PyUnicode_GET_SIZE(unicode),
3729 NULL);
3730}
3731
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003732#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003733
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003734/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003735
Martin v. Löwisd8251432006-06-14 05:21:04 +00003736#if SIZEOF_INT < SIZEOF_SSIZE_T
3737#define NEED_RETRY
3738#endif
3739
3740/* XXX This code is limited to "true" double-byte encodings, as
3741 a) it assumes an incomplete character consists of a single byte, and
3742 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3743 encodings, see IsDBCSLeadByteEx documentation. */
3744
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte must satisfy IsDBCSLeadByte() and, looking
   at the character CharPrev() reports before it, either there is no such
   character, or it does not start with a lead byte, or it is two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3755
3756/*
3757 * Decode MBCS string into unicode object. If 'final' is set, converts
3758 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3759 */
3760static int decode_mbcs(PyUnicodeObject **v,
3761 const char *s, /* MBCS string */
3762 int size, /* sizeof MBCS string */
3763 int final)
3764{
3765 Py_UNICODE *p;
3766 Py_ssize_t n = 0;
3767 int usize = 0;
3768
3769 assert(size >= 0);
3770
3771 /* Skip trailing lead-byte unless 'final' is set */
3772 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3773 --size;
3774
3775 /* First get the size of the result */
3776 if (size > 0) {
3777 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3778 if (usize == 0) {
3779 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3780 return -1;
3781 }
3782 }
3783
3784 if (*v == NULL) {
3785 /* Create unicode object */
3786 *v = _PyUnicode_New(usize);
3787 if (*v == NULL)
3788 return -1;
3789 }
3790 else {
3791 /* Extend unicode object */
3792 n = PyUnicode_GET_SIZE(*v);
3793 if (_PyUnicode_Resize(v, n + usize) < 0)
3794 return -1;
3795 }
3796
3797 /* Do the conversion */
3798 if (size > 0) {
3799 p = PyUnicode_AS_UNICODE(*v) + n;
3800 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3801 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3802 return -1;
3803 }
3804 }
3805
3806 return size;
3807}
3808
3809PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3810 Py_ssize_t size,
3811 const char *errors,
3812 Py_ssize_t *consumed)
3813{
3814 PyUnicodeObject *v = NULL;
3815 int done;
3816
3817 if (consumed)
3818 *consumed = 0;
3819
3820#ifdef NEED_RETRY
3821 retry:
3822 if (size > INT_MAX)
3823 done = decode_mbcs(&v, s, INT_MAX, 0);
3824 else
3825#endif
3826 done = decode_mbcs(&v, s, (int)size, !consumed);
3827
3828 if (done < 0) {
3829 Py_XDECREF(v);
3830 return NULL;
3831 }
3832
3833 if (consumed)
3834 *consumed += done;
3835
3836#ifdef NEED_RETRY
3837 if (size > INT_MAX) {
3838 s += done;
3839 size -= done;
3840 goto retry;
3841 }
3842#endif
3843
3844 return (PyObject *)v;
3845}
3846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003847PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003848 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003849 const char *errors)
3850{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3852}
3853
3854/*
3855 * Convert unicode into string object (MBCS).
3856 * Returns 0 if succeed, -1 otherwise.
3857 */
3858static int encode_mbcs(PyObject **repr,
3859 const Py_UNICODE *p, /* unicode */
3860 int size) /* size of unicode */
3861{
3862 int mbcssize = 0;
3863 Py_ssize_t n = 0;
3864
3865 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003866
3867 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868 if (size > 0) {
3869 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3870 if (mbcssize == 0) {
3871 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3872 return -1;
3873 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003874 }
3875
Martin v. Löwisd8251432006-06-14 05:21:04 +00003876 if (*repr == NULL) {
3877 /* Create string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003878 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003879 if (*repr == NULL)
3880 return -1;
3881 }
3882 else {
3883 /* Extend string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003884 n = PyString_Size(*repr);
3885 if (_PyString_Resize(repr, n + mbcssize) < 0)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 return -1;
3887 }
3888
3889 /* Do the conversion */
3890 if (size > 0) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003891 char *s = PyString_AS_STRING(*repr) + n;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3893 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3894 return -1;
3895 }
3896 }
3897
3898 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003899}
3900
3901PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003902 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903 const char *errors)
3904{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003905 PyObject *repr = NULL;
3906 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003907
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908#ifdef NEED_RETRY
3909 retry:
3910 if (size > INT_MAX)
3911 ret = encode_mbcs(&repr, p, INT_MAX);
3912 else
3913#endif
3914 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003915
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916 if (ret < 0) {
3917 Py_XDECREF(repr);
3918 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003920
3921#ifdef NEED_RETRY
3922 if (size > INT_MAX) {
3923 p += INT_MAX;
3924 size -= INT_MAX;
3925 goto retry;
3926 }
3927#endif
3928
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003929 return repr;
3930}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003931
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003932PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3933{
3934 if (!PyUnicode_Check(unicode)) {
3935 PyErr_BadArgument();
3936 return NULL;
3937 }
3938 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3939 PyUnicode_GET_SIZE(unicode),
3940 NULL);
3941}
3942
Martin v. Löwisd8251432006-06-14 05:21:04 +00003943#undef NEED_RETRY
3944
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003945#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003946
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947/* --- Character Mapping Codec -------------------------------------------- */
3948
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003950 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 PyObject *mapping,
3952 const char *errors)
3953{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003955 Py_ssize_t startinpos;
3956 Py_ssize_t endinpos;
3957 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 PyUnicodeObject *v;
3960 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003961 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 PyObject *errorHandler = NULL;
3963 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003964 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003965 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003966
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 /* Default to Latin-1 */
3968 if (mapping == NULL)
3969 return PyUnicode_DecodeLatin1(s, size, errors);
3970
3971 v = _PyUnicode_New(size);
3972 if (v == NULL)
3973 goto onError;
3974 if (size == 0)
3975 return (PyObject *)v;
3976 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003978 if (PyUnicode_CheckExact(mapping)) {
3979 mapstring = PyUnicode_AS_UNICODE(mapping);
3980 maplen = PyUnicode_GET_SIZE(mapping);
3981 while (s < e) {
3982 unsigned char ch = *s;
3983 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003985 if (ch < maplen)
3986 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003988 if (x == 0xfffe) {
3989 /* undefined mapping */
3990 outpos = p-PyUnicode_AS_UNICODE(v);
3991 startinpos = s-starts;
3992 endinpos = startinpos+1;
3993 if (unicode_decode_call_errorhandler(
3994 errors, &errorHandler,
3995 "charmap", "character maps to <undefined>",
3996 starts, size, &startinpos, &endinpos, &exc, &s,
3997 (PyObject **)&v, &outpos, &p)) {
3998 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003999 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004000 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004001 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004002 *p++ = x;
4003 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004005 }
4006 else {
4007 while (s < e) {
4008 unsigned char ch = *s;
4009 PyObject *w, *x;
4010
4011 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4012 w = PyInt_FromLong((long)ch);
4013 if (w == NULL)
4014 goto onError;
4015 x = PyObject_GetItem(mapping, w);
4016 Py_DECREF(w);
4017 if (x == NULL) {
4018 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4019 /* No mapping found means: mapping is undefined. */
4020 PyErr_Clear();
4021 x = Py_None;
4022 Py_INCREF(x);
4023 } else
4024 goto onError;
4025 }
4026
4027 /* Apply mapping */
4028 if (PyInt_Check(x)) {
4029 long value = PyInt_AS_LONG(x);
4030 if (value < 0 || value > 65535) {
4031 PyErr_SetString(PyExc_TypeError,
4032 "character mapping must be in range(65536)");
4033 Py_DECREF(x);
4034 goto onError;
4035 }
4036 *p++ = (Py_UNICODE)value;
4037 }
4038 else if (x == Py_None) {
4039 /* undefined mapping */
4040 outpos = p-PyUnicode_AS_UNICODE(v);
4041 startinpos = s-starts;
4042 endinpos = startinpos+1;
4043 if (unicode_decode_call_errorhandler(
4044 errors, &errorHandler,
4045 "charmap", "character maps to <undefined>",
4046 starts, size, &startinpos, &endinpos, &exc, &s,
4047 (PyObject **)&v, &outpos, &p)) {
4048 Py_DECREF(x);
4049 goto onError;
4050 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004051 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004052 continue;
4053 }
4054 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004055 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004056
4057 if (targetsize == 1)
4058 /* 1-1 mapping */
4059 *p++ = *PyUnicode_AS_UNICODE(x);
4060
4061 else if (targetsize > 1) {
4062 /* 1-n mapping */
4063 if (targetsize > extrachars) {
4064 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4066 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 (targetsize << 2);
4068 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004069 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004070 if (_PyUnicode_Resize(&v,
4071 PyUnicode_GET_SIZE(v) + needed) < 0) {
4072 Py_DECREF(x);
4073 goto onError;
4074 }
4075 p = PyUnicode_AS_UNICODE(v) + oldpos;
4076 }
4077 Py_UNICODE_COPY(p,
4078 PyUnicode_AS_UNICODE(x),
4079 targetsize);
4080 p += targetsize;
4081 extrachars -= targetsize;
4082 }
4083 /* 1-0 mapping: skip the character */
4084 }
4085 else {
4086 /* wrong return value */
4087 PyErr_SetString(PyExc_TypeError,
4088 "character mapping must return integer, None or unicode");
4089 Py_DECREF(x);
4090 goto onError;
4091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004093 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
4096 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004097 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099 Py_XDECREF(errorHandler);
4100 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 Py_XDECREF(errorHandler);
4105 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 Py_XDECREF(v);
4107 return NULL;
4108}
4109
Martin v. Löwis3f767792006-06-04 19:36:28 +00004110/* Charmap encoding: the lookup table */
4111
4112struct encoding_map{
4113 PyObject_HEAD
4114 unsigned char level1[32];
4115 int count2, count3;
4116 unsigned char level23[1];
4117};
4118
4119static PyObject*
4120encoding_map_size(PyObject *obj, PyObject* args)
4121{
4122 struct encoding_map *map = (struct encoding_map*)obj;
4123 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4124 128*map->count3);
4125}
4126
4127static PyMethodDef encoding_map_methods[] = {
4128 {"size", encoding_map_size, METH_NOARGS,
4129 PyDoc_STR("Return the size (in bytes) of this object") },
4130 { 0 }
4131};
4132
4133static void
4134encoding_map_dealloc(PyObject* o)
4135{
4136 PyObject_FREE(o);
4137}
4138
4139static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004140 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004141 "EncodingMap", /*tp_name*/
4142 sizeof(struct encoding_map), /*tp_basicsize*/
4143 0, /*tp_itemsize*/
4144 /* methods */
4145 encoding_map_dealloc, /*tp_dealloc*/
4146 0, /*tp_print*/
4147 0, /*tp_getattr*/
4148 0, /*tp_setattr*/
4149 0, /*tp_compare*/
4150 0, /*tp_repr*/
4151 0, /*tp_as_number*/
4152 0, /*tp_as_sequence*/
4153 0, /*tp_as_mapping*/
4154 0, /*tp_hash*/
4155 0, /*tp_call*/
4156 0, /*tp_str*/
4157 0, /*tp_getattro*/
4158 0, /*tp_setattro*/
4159 0, /*tp_as_buffer*/
4160 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4161 0, /*tp_doc*/
4162 0, /*tp_traverse*/
4163 0, /*tp_clear*/
4164 0, /*tp_richcompare*/
4165 0, /*tp_weaklistoffset*/
4166 0, /*tp_iter*/
4167 0, /*tp_iternext*/
4168 encoding_map_methods, /*tp_methods*/
4169 0, /*tp_members*/
4170 0, /*tp_getset*/
4171 0, /*tp_base*/
4172 0, /*tp_dict*/
4173 0, /*tp_descr_get*/
4174 0, /*tp_descr_set*/
4175 0, /*tp_dictoffset*/
4176 0, /*tp_init*/
4177 0, /*tp_alloc*/
4178 0, /*tp_new*/
4179 0, /*tp_free*/
4180 0, /*tp_is_gc*/
4181};
4182
4183PyObject*
4184PyUnicode_BuildEncodingMap(PyObject* string)
4185{
4186 Py_UNICODE *decode;
4187 PyObject *result;
4188 struct encoding_map *mresult;
4189 int i;
4190 int need_dict = 0;
4191 unsigned char level1[32];
4192 unsigned char level2[512];
4193 unsigned char *mlevel1, *mlevel2, *mlevel3;
4194 int count2 = 0, count3 = 0;
4195
4196 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4197 PyErr_BadArgument();
4198 return NULL;
4199 }
4200 decode = PyUnicode_AS_UNICODE(string);
4201 memset(level1, 0xFF, sizeof level1);
4202 memset(level2, 0xFF, sizeof level2);
4203
4204 /* If there isn't a one-to-one mapping of NULL to \0,
4205 or if there are non-BMP characters, we need to use
4206 a mapping dictionary. */
4207 if (decode[0] != 0)
4208 need_dict = 1;
4209 for (i = 1; i < 256; i++) {
4210 int l1, l2;
4211 if (decode[i] == 0
4212 #ifdef Py_UNICODE_WIDE
4213 || decode[i] > 0xFFFF
4214 #endif
4215 ) {
4216 need_dict = 1;
4217 break;
4218 }
4219 if (decode[i] == 0xFFFE)
4220 /* unmapped character */
4221 continue;
4222 l1 = decode[i] >> 11;
4223 l2 = decode[i] >> 7;
4224 if (level1[l1] == 0xFF)
4225 level1[l1] = count2++;
4226 if (level2[l2] == 0xFF)
4227 level2[l2] = count3++;
4228 }
4229
4230 if (count2 >= 0xFF || count3 >= 0xFF)
4231 need_dict = 1;
4232
4233 if (need_dict) {
4234 PyObject *result = PyDict_New();
4235 PyObject *key, *value;
4236 if (!result)
4237 return NULL;
4238 for (i = 0; i < 256; i++) {
4239 key = value = NULL;
4240 key = PyInt_FromLong(decode[i]);
4241 value = PyInt_FromLong(i);
4242 if (!key || !value)
4243 goto failed1;
4244 if (PyDict_SetItem(result, key, value) == -1)
4245 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004246 Py_DECREF(key);
4247 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004248 }
4249 return result;
4250 failed1:
4251 Py_XDECREF(key);
4252 Py_XDECREF(value);
4253 Py_DECREF(result);
4254 return NULL;
4255 }
4256
4257 /* Create a three-level trie */
4258 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4259 16*count2 + 128*count3 - 1);
4260 if (!result)
4261 return PyErr_NoMemory();
4262 PyObject_Init(result, &EncodingMapType);
4263 mresult = (struct encoding_map*)result;
4264 mresult->count2 = count2;
4265 mresult->count3 = count3;
4266 mlevel1 = mresult->level1;
4267 mlevel2 = mresult->level23;
4268 mlevel3 = mresult->level23 + 16*count2;
4269 memcpy(mlevel1, level1, 32);
4270 memset(mlevel2, 0xFF, 16*count2);
4271 memset(mlevel3, 0, 128*count3);
4272 count3 = 0;
4273 for (i = 1; i < 256; i++) {
4274 int o1, o2, o3, i2, i3;
4275 if (decode[i] == 0xFFFE)
4276 /* unmapped character */
4277 continue;
4278 o1 = decode[i]>>11;
4279 o2 = (decode[i]>>7) & 0xF;
4280 i2 = 16*mlevel1[o1] + o2;
4281 if (mlevel2[i2] == 0xFF)
4282 mlevel2[i2] = count3++;
4283 o3 = decode[i] & 0x7F;
4284 i3 = 128*mlevel2[i2] + o3;
4285 mlevel3[i3] = i;
4286 }
4287 return result;
4288}
4289
4290static int
4291encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4292{
4293 struct encoding_map *map = (struct encoding_map*)mapping;
4294 int l1 = c>>11;
4295 int l2 = (c>>7) & 0xF;
4296 int l3 = c & 0x7F;
4297 int i;
4298
4299#ifdef Py_UNICODE_WIDE
4300 if (c > 0xFFFF) {
4301 return -1;
4302 }
4303#endif
4304 if (c == 0)
4305 return 0;
4306 /* level 1*/
4307 i = map->level1[l1];
4308 if (i == 0xFF) {
4309 return -1;
4310 }
4311 /* level 2*/
4312 i = map->level23[16*i+l2];
4313 if (i == 0xFF) {
4314 return -1;
4315 }
4316 /* level 3 */
4317 i = map->level23[16*map->count2 + 128*i + l3];
4318 if (i == 0) {
4319 return -1;
4320 }
4321 return i;
4322}
4323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324/* Lookup the character ch in the mapping. If the character
4325 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004326 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 PyObject *w = PyInt_FromLong((long)c);
4330 PyObject *x;
4331
4332 if (w == NULL)
4333 return NULL;
4334 x = PyObject_GetItem(mapping, w);
4335 Py_DECREF(w);
4336 if (x == NULL) {
4337 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4338 /* No mapping found means: mapping is undefined. */
4339 PyErr_Clear();
4340 x = Py_None;
4341 Py_INCREF(x);
4342 return x;
4343 } else
4344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004346 else if (x == Py_None)
4347 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 else if (PyInt_Check(x)) {
4349 long value = PyInt_AS_LONG(x);
4350 if (value < 0 || value > 255) {
4351 PyErr_SetString(PyExc_TypeError,
4352 "character mapping must be in range(256)");
4353 Py_DECREF(x);
4354 return NULL;
4355 }
4356 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004358 else if (PyString_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361 /* wrong return value */
4362 PyErr_SetString(PyExc_TypeError,
4363 "character mapping must return integer, None or str");
4364 Py_DECREF(x);
4365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 }
4367}
4368
Martin v. Löwis3f767792006-06-04 19:36:28 +00004369static int
4370charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4371{
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004372 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373 /* exponentially overallocate to minimize reallocations */
4374 if (requiredsize < 2*outsize)
4375 requiredsize = 2*outsize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004376 if (_PyString_Resize(outobj, requiredsize)) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004377 return 0;
4378 }
4379 return 1;
4380}
4381
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385/* lookup the character, put the result in the output string and adjust
4386 various state variables. Reallocate the output string if not enough
4387 space is available. Return a new reference to the object that
4388 was put in the output buffer, or Py_None, if the mapping was undefined
4389 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004390 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004392charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004395 PyObject *rep;
4396 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004397 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398
Christian Heimese93237d2007-12-19 02:37:44 +00004399 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004400 int res = encoding_map_lookup(c, mapping);
4401 Py_ssize_t requiredsize = *outpos+1;
4402 if (res == -1)
4403 return enc_FAILED;
4404 if (outsize<requiredsize)
4405 if (!charmapencode_resize(outobj, outpos, requiredsize))
4406 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004407 outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 outstart[(*outpos)++] = (char)res;
4409 return enc_SUCCESS;
4410 }
4411
4412 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004414 return enc_EXCEPTION;
4415 else if (rep==Py_None) {
4416 Py_DECREF(rep);
4417 return enc_FAILED;
4418 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004420 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004421 if (outsize<requiredsize)
4422 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004424 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004426 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4428 }
4429 else {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004430 const char *repchars = PyString_AS_STRING(rep);
4431 Py_ssize_t repsize = PyString_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 if (outsize<requiredsize)
4434 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004436 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004438 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 memcpy(outstart + *outpos, repchars, repsize);
4440 *outpos += repsize;
4441 }
4442 }
Georg Brandl9f167602006-06-04 21:46:16 +00004443 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004444 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445}
4446
4447/* handle an error in PyUnicode_EncodeCharmap
4448 Return 0 on success, -1 on error */
4449static
4450int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004453 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455{
4456 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004457 Py_ssize_t repsize;
4458 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 Py_UNICODE *uni2;
4460 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t collstartpos = *inpos;
4462 Py_ssize_t collendpos = *inpos+1;
4463 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 char *encoding = "charmap";
4465 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004466 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 /* find all unencodable characters */
4469 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004470 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004471 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472 int res = encoding_map_lookup(p[collendpos], mapping);
4473 if (res != -1)
4474 break;
4475 ++collendpos;
4476 continue;
4477 }
4478
4479 rep = charmapencode_lookup(p[collendpos], mapping);
4480 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004482 else if (rep!=Py_None) {
4483 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 break;
4485 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004486 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 ++collendpos;
4488 }
4489 /* cache callback name lookup
4490 * (if not done yet, i.e. it's the first error) */
4491 if (*known_errorHandler==-1) {
4492 if ((errors==NULL) || (!strcmp(errors, "strict")))
4493 *known_errorHandler = 1;
4494 else if (!strcmp(errors, "replace"))
4495 *known_errorHandler = 2;
4496 else if (!strcmp(errors, "ignore"))
4497 *known_errorHandler = 3;
4498 else if (!strcmp(errors, "xmlcharrefreplace"))
4499 *known_errorHandler = 4;
4500 else
4501 *known_errorHandler = 0;
4502 }
4503 switch (*known_errorHandler) {
4504 case 1: /* strict */
4505 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4506 return -1;
4507 case 2: /* replace */
4508 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4509 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004510 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 return -1;
4512 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004513 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4515 return -1;
4516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 }
4518 /* fall through */
4519 case 3: /* ignore */
4520 *inpos = collendpos;
4521 break;
4522 case 4: /* xmlcharrefreplace */
4523 /* generate replacement (temporarily (mis)uses p) */
4524 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4525 char buffer[2+29+1+1];
4526 char *cp;
4527 sprintf(buffer, "&#%d;", (int)p[collpos]);
4528 for (cp = buffer; *cp; ++cp) {
4529 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004530 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004532 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4534 return -1;
4535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 }
4537 }
4538 *inpos = collendpos;
4539 break;
4540 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004541 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 encoding, reason, p, size, exceptionObject,
4543 collstartpos, collendpos, &newpos);
4544 if (repunicode == NULL)
4545 return -1;
4546 /* generate replacement */
4547 repsize = PyUnicode_GET_SIZE(repunicode);
4548 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4549 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004550 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 return -1;
4552 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004553 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4556 return -1;
4557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 }
4559 *inpos = newpos;
4560 Py_DECREF(repunicode);
4561 }
4562 return 0;
4563}
4564
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004566 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 PyObject *mapping,
4568 const char *errors)
4569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 /* output object */
4571 PyObject *res = NULL;
4572 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004573 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 PyObject *errorHandler = NULL;
4577 PyObject *exc = NULL;
4578 /* the following variable is used for caching string comparisons
4579 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4580 * 3=ignore, 4=xmlcharrefreplace */
4581 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582
4583 /* Default to Latin-1 */
4584 if (mapping == NULL)
4585 return PyUnicode_EncodeLatin1(p, size, errors);
4586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 /* allocate enough for a simple encoding without
4588 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004589 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 if (res == NULL)
4591 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004592 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 while (inpos<size) {
4596 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004597 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4598 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004600 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 if (charmap_encoding_error(p, size, &inpos, mapping,
4602 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004603 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004604 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004605 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 else
4609 /* done with this character => adjust input position */
4610 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004614 if (respos<PyString_GET_SIZE(res)) {
4615 if (_PyString_Resize(&res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 goto onError;
4617 }
4618 Py_XDECREF(exc);
4619 Py_XDECREF(errorHandler);
4620 return res;
4621
4622 onError:
4623 Py_XDECREF(res);
4624 Py_XDECREF(exc);
4625 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 return NULL;
4627}
4628
4629PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4630 PyObject *mapping)
4631{
4632 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4633 PyErr_BadArgument();
4634 return NULL;
4635 }
4636 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4637 PyUnicode_GET_SIZE(unicode),
4638 mapping,
4639 NULL);
4640}
4641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642/* create or adjust a UnicodeTranslateError */
4643static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644 const Py_UNICODE *unicode, Py_ssize_t size,
4645 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 if (*exceptionObject == NULL) {
4649 *exceptionObject = PyUnicodeTranslateError_Create(
4650 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 }
4652 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4654 goto onError;
4655 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4656 goto onError;
4657 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4658 goto onError;
4659 return;
4660 onError:
4661 Py_DECREF(*exceptionObject);
4662 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 }
4664}
4665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666/* raises a UnicodeTranslateError */
4667static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004668 const Py_UNICODE *unicode, Py_ssize_t size,
4669 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 const char *reason)
4671{
4672 make_translate_exception(exceptionObject,
4673 unicode, size, startpos, endpos, reason);
4674 if (*exceptionObject != NULL)
4675 PyCodec_StrictErrors(*exceptionObject);
4676}
4677
4678/* error handling callback helper:
4679 build arguments, call the callback and check the arguments,
4680 put the result into newpos and return the replacement string, which
4681 has to be freed by the caller */
4682static PyObject *unicode_translate_call_errorhandler(const char *errors,
4683 PyObject **errorHandler,
4684 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004685 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4686 Py_ssize_t startpos, Py_ssize_t endpos,
4687 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004689 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690
Martin v. Löwis412fb672006-04-13 06:34:32 +00004691 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 PyObject *restuple;
4693 PyObject *resunicode;
4694
4695 if (*errorHandler == NULL) {
4696 *errorHandler = PyCodec_LookupError(errors);
4697 if (*errorHandler == NULL)
4698 return NULL;
4699 }
4700
4701 make_translate_exception(exceptionObject,
4702 unicode, size, startpos, endpos, reason);
4703 if (*exceptionObject == NULL)
4704 return NULL;
4705
4706 restuple = PyObject_CallFunctionObjArgs(
4707 *errorHandler, *exceptionObject, NULL);
4708 if (restuple == NULL)
4709 return NULL;
4710 if (!PyTuple_Check(restuple)) {
4711 PyErr_Format(PyExc_TypeError, &argparse[4]);
4712 Py_DECREF(restuple);
4713 return NULL;
4714 }
4715 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004716 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_DECREF(restuple);
4718 return NULL;
4719 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 if (i_newpos<0)
4721 *newpos = size+i_newpos;
4722 else
4723 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004724 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004725 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004726 Py_DECREF(restuple);
4727 return NULL;
4728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 Py_INCREF(resunicode);
4730 Py_DECREF(restuple);
4731 return resunicode;
4732}
4733
4734/* Lookup the character ch in the mapping and put the result in result,
4735 which must be decrefed by the caller.
4736 Return 0 on success, -1 on error */
4737static
4738int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4739{
4740 PyObject *w = PyInt_FromLong((long)c);
4741 PyObject *x;
4742
4743 if (w == NULL)
4744 return -1;
4745 x = PyObject_GetItem(mapping, w);
4746 Py_DECREF(w);
4747 if (x == NULL) {
4748 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4749 /* No mapping found means: use 1:1 mapping. */
4750 PyErr_Clear();
4751 *result = NULL;
4752 return 0;
4753 } else
4754 return -1;
4755 }
4756 else if (x == Py_None) {
4757 *result = x;
4758 return 0;
4759 }
4760 else if (PyInt_Check(x)) {
4761 long value = PyInt_AS_LONG(x);
4762 long max = PyUnicode_GetMax();
4763 if (value < 0 || value > max) {
4764 PyErr_Format(PyExc_TypeError,
4765 "character mapping must be in range(0x%lx)", max+1);
4766 Py_DECREF(x);
4767 return -1;
4768 }
4769 *result = x;
4770 return 0;
4771 }
4772 else if (PyUnicode_Check(x)) {
4773 *result = x;
4774 return 0;
4775 }
4776 else {
4777 /* wrong return value */
4778 PyErr_SetString(PyExc_TypeError,
4779 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004780 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 return -1;
4782 }
4783}
4784/* ensure that *outobj is at least requiredsize characters long,
4785if not reallocate and adjust various state variables.
4786Return 0 on success, -1 on error */
4787static
Walter Dörwald4894c302003-10-24 14:25:28 +00004788int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004789 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004792 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004793 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004794 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (requiredsize < 2 * oldsize)
4797 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004798 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 return -1;
4800 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 }
4802 return 0;
4803}
4804/* lookup the character, put the result in the output string and adjust
4805 various state variables. Return a new reference to the object that
4806 was put in the output buffer in *result, or Py_None, if the mapping was
4807 undefined (in which case no character was written).
4808 The called must decref result.
4809 Return 0 on success, -1 on error. */
4810static
Walter Dörwald4894c302003-10-24 14:25:28 +00004811int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004812 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004813 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814{
Walter Dörwald4894c302003-10-24 14:25:28 +00004815 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 return -1;
4817 if (*res==NULL) {
4818 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004819 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 }
4821 else if (*res==Py_None)
4822 ;
4823 else if (PyInt_Check(*res)) {
4824 /* no overflow check, because we know that the space is enough */
4825 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4826 }
4827 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004828 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 if (repsize==1) {
4830 /* no overflow check, because we know that the space is enough */
4831 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4832 }
4833 else if (repsize!=0) {
4834 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004835 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004836 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004837 repsize - 1;
4838 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 return -1;
4840 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4841 *outp += repsize;
4842 }
4843 }
4844 else
4845 return -1;
4846 return 0;
4847}
4848
/* Translate the size characters starting at p through mapping and return
   the result as a Unicode object, or NULL on error.

   Each input character is passed to charmaptranslate_output().  When that
   reports Py_None (untranslatable), the run of consecutive untranslatable
   characters is collected and handled according to errors:
     NULL / "strict"      raise via raise_translate_exception()
     "replace"            write one '?' per character
     "ignore"             skip the run
     "xmlcharrefreplace"  write "&#N;" for each character
     anything else        call the registered error handler
   A NULL mapping is rejected with PyErr_BadArgument(). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared by identity below, so it can be released now */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    /* scratch buffer for one "&#N;" reference */
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", (int)*p);
                    /* room for what is written so far, this reference,
                       and the input after the collected run */
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                    reason, startp, size, &exc,
                    collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement */
                repsize = PyUnicode_GET_SIZE(repunicode);
                if (charmaptranslate_makespace(&res, &str,
                    (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                /* continue at the position chosen by the error handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    /* shared failure exit: drop everything this function owns */
    onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4990
4991PyObject *PyUnicode_Translate(PyObject *str,
4992 PyObject *mapping,
4993 const char *errors)
4994{
4995 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004996
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 str = PyUnicode_FromObject(str);
4998 if (str == NULL)
4999 goto onError;
5000 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5001 PyUnicode_GET_SIZE(str),
5002 mapping,
5003 errors);
5004 Py_DECREF(str);
5005 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005006
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 onError:
5008 Py_XDECREF(str);
5009 return NULL;
5010}
Tim Petersced69f82003-09-16 20:30:58 +00005011
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012/* --- Decimal Encoder ---------------------------------------------------- */
5013
5014int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005016 char *output,
5017 const char *errors)
5018{
5019 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 PyObject *errorHandler = NULL;
5021 PyObject *exc = NULL;
5022 const char *encoding = "decimal";
5023 const char *reason = "invalid decimal Unicode string";
5024 /* the following variable is used for caching string comparisons
5025 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5026 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005027
5028 if (output == NULL) {
5029 PyErr_BadArgument();
5030 return -1;
5031 }
5032
5033 p = s;
5034 end = s + length;
5035 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005037 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005039 Py_ssize_t repsize;
5040 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 Py_UNICODE *uni2;
5042 Py_UNICODE *collstart;
5043 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005044
Guido van Rossum9e896b32000-04-05 20:11:21 +00005045 if (Py_UNICODE_ISSPACE(ch)) {
5046 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005048 continue;
5049 }
5050 decimal = Py_UNICODE_TODECIMAL(ch);
5051 if (decimal >= 0) {
5052 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005054 continue;
5055 }
Guido van Rossumba477042000-04-06 18:18:10 +00005056 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005057 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005059 continue;
5060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 /* All other characters are considered unencodable */
5062 collstart = p;
5063 collend = p+1;
5064 while (collend < end) {
5065 if ((0 < *collend && *collend < 256) ||
5066 !Py_UNICODE_ISSPACE(*collend) ||
5067 Py_UNICODE_TODECIMAL(*collend))
5068 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 /* cache callback name lookup
5071 * (if not done yet, i.e. it's the first error) */
5072 if (known_errorHandler==-1) {
5073 if ((errors==NULL) || (!strcmp(errors, "strict")))
5074 known_errorHandler = 1;
5075 else if (!strcmp(errors, "replace"))
5076 known_errorHandler = 2;
5077 else if (!strcmp(errors, "ignore"))
5078 known_errorHandler = 3;
5079 else if (!strcmp(errors, "xmlcharrefreplace"))
5080 known_errorHandler = 4;
5081 else
5082 known_errorHandler = 0;
5083 }
5084 switch (known_errorHandler) {
5085 case 1: /* strict */
5086 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5087 goto onError;
5088 case 2: /* replace */
5089 for (p = collstart; p < collend; ++p)
5090 *output++ = '?';
5091 /* fall through */
5092 case 3: /* ignore */
5093 p = collend;
5094 break;
5095 case 4: /* xmlcharrefreplace */
5096 /* generate replacement (temporarily (mis)uses p) */
5097 for (p = collstart; p < collend; ++p)
5098 output += sprintf(output, "&#%d;", (int)*p);
5099 p = collend;
5100 break;
5101 default:
5102 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5103 encoding, reason, s, length, &exc,
5104 collstart-s, collend-s, &newpos);
5105 if (repunicode == NULL)
5106 goto onError;
5107 /* generate replacement */
5108 repsize = PyUnicode_GET_SIZE(repunicode);
5109 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5110 Py_UNICODE ch = *uni2;
5111 if (Py_UNICODE_ISSPACE(ch))
5112 *output++ = ' ';
5113 else {
5114 decimal = Py_UNICODE_TODECIMAL(ch);
5115 if (decimal >= 0)
5116 *output++ = '0' + decimal;
5117 else if (0 < ch && ch < 256)
5118 *output++ = (char)ch;
5119 else {
5120 Py_DECREF(repunicode);
5121 raise_encode_exception(&exc, encoding,
5122 s, length, collstart-s, collend-s, reason);
5123 goto onError;
5124 }
5125 }
5126 }
5127 p = s + newpos;
5128 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005129 }
5130 }
5131 /* 0-terminate the output string */
5132 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005133 Py_XDECREF(exc);
5134 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005135 return 0;
5136
5137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 Py_XDECREF(exc);
5139 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005140 return -1;
5141}
5142
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143/* --- Helpers ------------------------------------------------------------ */
5144
Eric Smitha9f7d622008-02-17 19:46:49 +00005145#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005146
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005147#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005148
Fredrik Lundha50d2012006-05-26 17:04:58 +00005149#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005150
5151#include "stringlib/count.h"
5152#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005153#include "stringlib/partition.h"
5154
/* Helper macro to fix up start/end slice values.

   Operates on the caller's local variables `start` and `end`, using
   (obj)->length as the sequence length:
     - a negative start is offset by the length, then clamped to 0;
     - an end beyond the length is clamped to the length;
     - a negative end is offset by the length, then clamped to 0.
   Wrapped in do { } while (0) so that the expansion is one statement and
   can be used safely as the body of an unbraced if/else.  Call it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5167
Martin v. Löwis18e16552006-02-15 17:27:45 +00005168Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005169 PyObject *substr,
5170 Py_ssize_t start,
5171 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005174 PyUnicodeObject* str_obj;
5175 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005176
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005177 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5178 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005180 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5181 if (!sub_obj) {
5182 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 return -1;
5184 }
Tim Petersced69f82003-09-16 20:30:58 +00005185
Fredrik Lundhc8162812006-05-26 19:33:03 +00005186 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005187
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005188 result = stringlib_count(
5189 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5190 );
5191
5192 Py_DECREF(sub_obj);
5193 Py_DECREF(str_obj);
5194
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 return result;
5196}
5197
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005199 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005200 Py_ssize_t start,
5201 Py_ssize_t end,
5202 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005205
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005206 str = PyUnicode_FromObject(str);
5207 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005208 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005209 sub = PyUnicode_FromObject(sub);
5210 if (!sub) {
5211 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005212 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 }
Tim Petersced69f82003-09-16 20:30:58 +00005214
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005215 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005216 result = stringlib_find_slice(
5217 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5218 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5219 start, end
5220 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005221 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005222 result = stringlib_rfind_slice(
5223 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5224 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5225 start, end
5226 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005227
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005228 Py_DECREF(str);
5229 Py_DECREF(sub);
5230
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 return result;
5232}
5233
Tim Petersced69f82003-09-16 20:30:58 +00005234static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235int tailmatch(PyUnicodeObject *self,
5236 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t start,
5238 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 int direction)
5240{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 if (substring->length == 0)
5242 return 1;
5243
Fredrik Lundhc8162812006-05-26 19:33:03 +00005244 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245
5246 end -= substring->length;
5247 if (end < start)
5248 return 0;
5249
5250 if (direction > 0) {
5251 if (Py_UNICODE_MATCH(self, end, substring))
5252 return 1;
5253 } else {
5254 if (Py_UNICODE_MATCH(self, start, substring))
5255 return 1;
5256 }
5257
5258 return 0;
5259}
5260
Martin v. Löwis18e16552006-02-15 17:27:45 +00005261Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005263 Py_ssize_t start,
5264 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 int direction)
5266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 str = PyUnicode_FromObject(str);
5270 if (str == NULL)
5271 return -1;
5272 substr = PyUnicode_FromObject(substr);
5273 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005274 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 return -1;
5276 }
Tim Petersced69f82003-09-16 20:30:58 +00005277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 result = tailmatch((PyUnicodeObject *)str,
5279 (PyUnicodeObject *)substr,
5280 start, end, direction);
5281 Py_DECREF(str);
5282 Py_DECREF(substr);
5283 return result;
5284}
5285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286/* Apply fixfct filter to the Unicode object self and return a
5287 reference to the modified object */
5288
Tim Petersced69f82003-09-16 20:30:58 +00005289static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290PyObject *fixup(PyUnicodeObject *self,
5291 int (*fixfct)(PyUnicodeObject *s))
5292{
5293
5294 PyUnicodeObject *u;
5295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005296 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 if (u == NULL)
5298 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005299
5300 Py_UNICODE_COPY(u->str, self->str, self->length);
5301
Tim Peters7a29bd52001-09-12 03:03:31 +00005302 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 /* fixfct should return TRUE if it modified the buffer. If
5304 FALSE, return a reference to the original buffer instead
5305 (to save space, not time) */
5306 Py_INCREF(self);
5307 Py_DECREF(u);
5308 return (PyObject*) self;
5309 }
5310 return (PyObject*) u;
5311}
5312
Tim Petersced69f82003-09-16 20:30:58 +00005313static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314int fixupper(PyUnicodeObject *self)
5315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005316 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 Py_UNICODE *s = self->str;
5318 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005319
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 while (len-- > 0) {
5321 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 ch = Py_UNICODE_TOUPPER(*s);
5324 if (ch != *s) {
5325 status = 1;
5326 *s = ch;
5327 }
5328 s++;
5329 }
5330
5331 return status;
5332}
5333
Tim Petersced69f82003-09-16 20:30:58 +00005334static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335int fixlower(PyUnicodeObject *self)
5336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005337 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 Py_UNICODE *s = self->str;
5339 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005340
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 while (len-- > 0) {
5342 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 ch = Py_UNICODE_TOLOWER(*s);
5345 if (ch != *s) {
5346 status = 1;
5347 *s = ch;
5348 }
5349 s++;
5350 }
5351
5352 return status;
5353}
5354
Tim Petersced69f82003-09-16 20:30:58 +00005355static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356int fixswapcase(PyUnicodeObject *self)
5357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 Py_UNICODE *s = self->str;
5360 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005361
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 while (len-- > 0) {
5363 if (Py_UNICODE_ISUPPER(*s)) {
5364 *s = Py_UNICODE_TOLOWER(*s);
5365 status = 1;
5366 } else if (Py_UNICODE_ISLOWER(*s)) {
5367 *s = Py_UNICODE_TOUPPER(*s);
5368 status = 1;
5369 }
5370 s++;
5371 }
5372
5373 return status;
5374}
5375
Tim Petersced69f82003-09-16 20:30:58 +00005376static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377int fixcapitalize(PyUnicodeObject *self)
5378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005380 Py_UNICODE *s = self->str;
5381 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005382
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005383 if (len == 0)
5384 return 0;
5385 if (Py_UNICODE_ISLOWER(*s)) {
5386 *s = Py_UNICODE_TOUPPER(*s);
5387 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005389 s++;
5390 while (--len > 0) {
5391 if (Py_UNICODE_ISUPPER(*s)) {
5392 *s = Py_UNICODE_TOLOWER(*s);
5393 status = 1;
5394 }
5395 s++;
5396 }
5397 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398}
5399
5400static
5401int fixtitle(PyUnicodeObject *self)
5402{
5403 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5404 register Py_UNICODE *e;
5405 int previous_is_cased;
5406
5407 /* Shortcut for single character strings */
5408 if (PyUnicode_GET_SIZE(self) == 1) {
5409 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5410 if (*p != ch) {
5411 *p = ch;
5412 return 1;
5413 }
5414 else
5415 return 0;
5416 }
Tim Petersced69f82003-09-16 20:30:58 +00005417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 e = p + PyUnicode_GET_SIZE(self);
5419 previous_is_cased = 0;
5420 for (; p < e; p++) {
5421 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 if (previous_is_cased)
5424 *p = Py_UNICODE_TOLOWER(ch);
5425 else
5426 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005427
5428 if (Py_UNICODE_ISLOWER(ch) ||
5429 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_UNICODE_ISTITLE(ch))
5431 previous_is_cased = 1;
5432 else
5433 previous_is_cased = 0;
5434 }
5435 return 1;
5436}
5437
/* Join the items of seq into one Unicode object, placing separator
   between consecutive items (a single blank when separator is NULL).

   seq is materialised with PySequence_Fast().  Every item must be a str
   or unicode object and is converted with PyUnicode_FromObject().
   Special cases: an empty sequence yields u"", and a one-item sequence
   whose item is an exact unicode object yields that item itself.
   Returns NULL on failure; a non-string item raises TypeError and a
   result length that overflows Py_ssize_t raises OverflowError. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* # allocated Py_UNICODE units for string in res */
    Py_ssize_t res_used;         /* # used Py_UNICODE units */
    Py_UNICODE *res_p;       /* pointer to first free unit in res's string area */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        /* item is a borrowed reference until it is converted below */
        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        /* a negative sum means the addition wrapped around */
        if (new_res_used < 0)
            goto Overflow;
        if (i < seqlen - 1) {
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* re-read the buffer pointer after the resize */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

    /* success exit: release the helpers, hand res to the caller */
 Done:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

    /* reached only while an owned reference to item is held */
 Overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

 onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5586
Tim Petersced69f82003-09-16 20:30:58 +00005587static
5588PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t left,
5590 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 Py_UNICODE fill)
5592{
5593 PyUnicodeObject *u;
5594
5595 if (left < 0)
5596 left = 0;
5597 if (right < 0)
5598 right = 0;
5599
Tim Peters7a29bd52001-09-12 03:03:31 +00005600 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 Py_INCREF(self);
5602 return self;
5603 }
5604
Neal Norwitze7d8be82008-07-31 17:17:14 +00005605 if (left > PY_SSIZE_T_MAX - self->length ||
5606 right > PY_SSIZE_T_MAX - (left + self->length)) {
5607 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5608 return NULL;
5609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 u = _PyUnicode_New(left + self->length + right);
5611 if (u) {
5612 if (left)
5613 Py_UNICODE_FILL(u->str, fill, left);
5614 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5615 if (right)
5616 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5617 }
5618
5619 return u;
5620}
5621
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro expands in the context of the split helpers: it assigns to
   a local `str`, appends to a local `list`, and jumps to the enclosing
   function's `onError` label if either the allocation or the append
   fails.  The temporary reference held in `str` is released on every
   path.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and stays correct as the body of an unbraced
   if/else; callers supply the trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5632
5633static
5634PyObject *split_whitespace(PyUnicodeObject *self,
5635 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 register Py_ssize_t i;
5639 register Py_ssize_t j;
5640 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005642 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643
5644 for (i = j = 0; i < len; ) {
5645 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005646 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 i++;
5648 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005649 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 i++;
5651 if (j < i) {
5652 if (maxcount-- <= 0)
5653 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005654 SPLIT_APPEND(buf, j, i);
5655 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 i++;
5657 j = i;
5658 }
5659 }
5660 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005661 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 }
5663 return list;
5664
5665 onError:
5666 Py_DECREF(list);
5667 return NULL;
5668}
5669
5670PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005671 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 register Py_ssize_t i;
5674 register Py_ssize_t j;
5675 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 PyObject *list;
5677 PyObject *str;
5678 Py_UNICODE *data;
5679
5680 string = PyUnicode_FromObject(string);
5681 if (string == NULL)
5682 return NULL;
5683 data = PyUnicode_AS_UNICODE(string);
5684 len = PyUnicode_GET_SIZE(string);
5685
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 list = PyList_New(0);
5687 if (!list)
5688 goto onError;
5689
5690 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005691 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005692
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005694 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
5697 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005698 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 if (i < len) {
5700 if (data[i] == '\r' && i + 1 < len &&
5701 data[i+1] == '\n')
5702 i += 2;
5703 else
5704 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005705 if (keepends)
5706 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 }
Guido van Rossum86662912000-04-11 15:38:46 +00005708 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 j = i;
5710 }
5711 if (j < len) {
5712 SPLIT_APPEND(data, j, len);
5713 }
5714
5715 Py_DECREF(string);
5716 return list;
5717
5718 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005719 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 Py_DECREF(string);
5721 return NULL;
5722}
5723
Tim Petersced69f82003-09-16 20:30:58 +00005724static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725PyObject *split_char(PyUnicodeObject *self,
5726 PyObject *list,
5727 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005728 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 register Py_ssize_t i;
5731 register Py_ssize_t j;
5732 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005734 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005737 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 if (maxcount-- <= 0)
5739 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005740 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 i = j = i + 1;
5742 } else
5743 i++;
5744 }
5745 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005746 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 }
5748 return list;
5749
5750 onError:
5751 Py_DECREF(list);
5752 return NULL;
5753}
5754
Tim Petersced69f82003-09-16 20:30:58 +00005755static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756PyObject *split_substring(PyUnicodeObject *self,
5757 PyObject *list,
5758 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 register Py_ssize_t i;
5762 register Py_ssize_t j;
5763 Py_ssize_t len = self->length;
5764 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 PyObject *str;
5766
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005767 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 if (Py_UNICODE_MATCH(self, i, substring)) {
5769 if (maxcount-- <= 0)
5770 break;
5771 SPLIT_APPEND(self->str, j, i);
5772 i = j = i + sublen;
5773 } else
5774 i++;
5775 }
5776 if (j <= len) {
5777 SPLIT_APPEND(self->str, j, len);
5778 }
5779 return list;
5780
5781 onError:
5782 Py_DECREF(list);
5783 return NULL;
5784}
5785
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786static
5787PyObject *rsplit_whitespace(PyUnicodeObject *self,
5788 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005791 register Py_ssize_t i;
5792 register Py_ssize_t j;
5793 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005795 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796
5797 for (i = j = len - 1; i >= 0; ) {
5798 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005799 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800 i--;
5801 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005802 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803 i--;
5804 if (j > i) {
5805 if (maxcount-- <= 0)
5806 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005807 SPLIT_APPEND(buf, i + 1, j + 1);
5808 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005809 i--;
5810 j = i;
5811 }
5812 }
5813 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005814 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005815 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005816 if (PyList_Reverse(list) < 0)
5817 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 return list;
5819
5820 onError:
5821 Py_DECREF(list);
5822 return NULL;
5823}
5824
5825static
5826PyObject *rsplit_char(PyUnicodeObject *self,
5827 PyObject *list,
5828 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 register Py_ssize_t i;
5832 register Py_ssize_t j;
5833 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005835 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005836
5837 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005838 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005839 if (maxcount-- <= 0)
5840 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005841 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842 j = i = i - 1;
5843 } else
5844 i--;
5845 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005846 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005847 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005848 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005849 if (PyList_Reverse(list) < 0)
5850 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005851 return list;
5852
5853 onError:
5854 Py_DECREF(list);
5855 return NULL;
5856}
5857
5858static
5859PyObject *rsplit_substring(PyUnicodeObject *self,
5860 PyObject *list,
5861 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 register Py_ssize_t i;
5865 register Py_ssize_t j;
5866 Py_ssize_t len = self->length;
5867 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005868 PyObject *str;
5869
5870 for (i = len - sublen, j = len; i >= 0; ) {
5871 if (Py_UNICODE_MATCH(self, i, substring)) {
5872 if (maxcount-- <= 0)
5873 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005874 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005875 j = i;
5876 i -= sublen;
5877 } else
5878 i--;
5879 }
5880 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005881 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005882 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005883 if (PyList_Reverse(list) < 0)
5884 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005885 return list;
5886
5887 onError:
5888 Py_DECREF(list);
5889 return NULL;
5890}
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892#undef SPLIT_APPEND
5893
5894static
5895PyObject *split(PyUnicodeObject *self,
5896 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898{
5899 PyObject *list;
5900
5901 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005902 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
5904 list = PyList_New(0);
5905 if (!list)
5906 return NULL;
5907
5908 if (substring == NULL)
5909 return split_whitespace(self,list,maxcount);
5910
5911 else if (substring->length == 1)
5912 return split_char(self,list,substring->str[0],maxcount);
5913
5914 else if (substring->length == 0) {
5915 Py_DECREF(list);
5916 PyErr_SetString(PyExc_ValueError, "empty separator");
5917 return NULL;
5918 }
5919 else
5920 return split_substring(self,list,substring,maxcount);
5921}
5922
Tim Petersced69f82003-09-16 20:30:58 +00005923static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005924PyObject *rsplit(PyUnicodeObject *self,
5925 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005927{
5928 PyObject *list;
5929
5930 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005931 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005932
5933 list = PyList_New(0);
5934 if (!list)
5935 return NULL;
5936
5937 if (substring == NULL)
5938 return rsplit_whitespace(self,list,maxcount);
5939
5940 else if (substring->length == 1)
5941 return rsplit_char(self,list,substring->str[0],maxcount);
5942
5943 else if (substring->length == 0) {
5944 Py_DECREF(list);
5945 PyErr_SetString(PyExc_ValueError, "empty separator");
5946 return NULL;
5947 }
5948 else
5949 return rsplit_substring(self,list,substring,maxcount);
5950}
5951
5952static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953PyObject *replace(PyUnicodeObject *self,
5954 PyUnicodeObject *str1,
5955 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
5958 PyUnicodeObject *u;
5959
5960 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005961 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Fredrik Lundh347ee272006-05-24 16:35:18 +00005963 if (str1->length == str2->length) {
5964 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005965 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005966 if (str1->length == 1) {
5967 /* replace characters */
5968 Py_UNICODE u1, u2;
5969 if (!findchar(self->str, self->length, str1->str[0]))
5970 goto nothing;
5971 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5972 if (!u)
5973 return NULL;
5974 Py_UNICODE_COPY(u->str, self->str, self->length);
5975 u1 = str1->str[0];
5976 u2 = str2->str[0];
5977 for (i = 0; i < u->length; i++)
5978 if (u->str[i] == u1) {
5979 if (--maxcount < 0)
5980 break;
5981 u->str[i] = u2;
5982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005984 i = fastsearch(
5985 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005987 if (i < 0)
5988 goto nothing;
5989 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5990 if (!u)
5991 return NULL;
5992 Py_UNICODE_COPY(u->str, self->str, self->length);
5993 while (i <= self->length - str1->length)
5994 if (Py_UNICODE_MATCH(self, i, str1)) {
5995 if (--maxcount < 0)
5996 break;
5997 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5998 i += str1->length;
5999 } else
6000 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006003
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006004 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006005 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 Py_UNICODE *p;
6007
6008 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006009 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 if (n > maxcount)
6011 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006012 if (n == 0)
6013 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006014 /* new_size = self->length + n * (str2->length - str1->length)); */
6015 delta = (str2->length - str1->length);
6016 if (delta == 0) {
6017 new_size = self->length;
6018 } else {
6019 product = n * (str2->length - str1->length);
6020 if ((product / (str2->length - str1->length)) != n) {
6021 PyErr_SetString(PyExc_OverflowError,
6022 "replace string is too long");
6023 return NULL;
6024 }
6025 new_size = self->length + product;
6026 if (new_size < 0) {
6027 PyErr_SetString(PyExc_OverflowError,
6028 "replace string is too long");
6029 return NULL;
6030 }
6031 }
6032 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006033 if (!u)
6034 return NULL;
6035 i = 0;
6036 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006037 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006038 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006039 while (n-- > 0) {
6040 /* look for next match */
6041 j = i;
6042 while (j <= e) {
6043 if (Py_UNICODE_MATCH(self, j, str1))
6044 break;
6045 j++;
6046 }
6047 if (j > i) {
6048 if (j > e)
6049 break;
6050 /* copy unchanged part [i:j] */
6051 Py_UNICODE_COPY(p, self->str+i, j-i);
6052 p += j - i;
6053 }
6054 /* copy substitution string */
6055 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006056 Py_UNICODE_COPY(p, str2->str, str2->length);
6057 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006058 }
6059 i = j + str1->length;
6060 }
6061 if (i < self->length)
6062 /* copy tail [i:] */
6063 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006064 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006065 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006066 while (n > 0) {
6067 Py_UNICODE_COPY(p, str2->str, str2->length);
6068 p += str2->length;
6069 if (--n <= 0)
6070 break;
6071 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006073 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 }
6075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006077
6078nothing:
6079 /* nothing to replace; return original string (when possible) */
6080 if (PyUnicode_CheckExact(self)) {
6081 Py_INCREF(self);
6082 return (PyObject *) self;
6083 }
6084 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085}
6086
6087/* --- Unicode Object Methods --------------------------------------------- */
6088
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): hands `self` to fixup() together with
   the fixtitle callback and returns whatever fixup() returns.
   NOTE(review): fixup() and fixtitle are defined elsewhere in this
   file; presumably fixup() works on a copy and may return NULL on
   error -- confirm there. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
6100
PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implementation of S.capitalize(): hands `self` to fixup() together
   with the fixcapitalize callback and returns whatever fixup() returns.
   NOTE(review): fixup() and fixcapitalize are defined elsewhere in
   this file; presumably fixup() works on a copy and may return NULL on
   error -- confirm there. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
6112
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run
   each word through fixup(..., fixcapitalize), then join the words
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6150
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006151/* Argument converter. Coerces to a single unicode character */
6152
6153static int
6154convert_uc(PyObject *obj, void *addr)
6155{
6156 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6157 PyObject *uniobj;
6158 Py_UNICODE *unistr;
6159
6160 uniobj = PyUnicode_FromObject(obj);
6161 if (uniobj == NULL) {
6162 PyErr_SetString(PyExc_TypeError,
6163 "The fill character cannot be converted to Unicode");
6164 return 0;
6165 }
6166 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6167 PyErr_SetString(PyExc_TypeError,
6168 "The fill character must be exactly one character long");
6169 Py_DECREF(uniobj);
6170 return 0;
6171 }
6172 unistr = PyUnicode_AS_UNICODE(uniobj);
6173 *fillcharloc = unistr[0];
6174 Py_DECREF(uniobj);
6175 return 1;
6176}
6177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006178PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006179"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006181Return S centered in a Unicode string of length width. Padding is\n\
6182done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184static PyObject *
6185unicode_center(PyUnicodeObject *self, PyObject *args)
6186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006187 Py_ssize_t marg, left;
6188 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006189 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Thomas Woutersde017742006-02-16 19:34:37 +00006191 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 return NULL;
6193
Tim Peters7a29bd52001-09-12 03:03:31 +00006194 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 Py_INCREF(self);
6196 return (PyObject*) self;
6197 }
6198
6199 marg = width - self->length;
6200 left = marg / 2 + (marg & width & 1);
6201
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006202 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
Marc-André Lemburge5034372000-08-08 08:04:29 +00006205#if 0
6206
6207/* This code should go into some future Unicode collation support
6208 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006209 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006210
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006211/* speedy UTF-16 code point order comparison */
6212/* gleaned from: */
6213/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6214
/* Per-code-unit adjustment table, indexed by (code unit >> 11).
   Only slots 27..31 are non-zero: slot 27 (code units 0xD800-0xDFFF)
   adds 0x2000, slots 28..31 (0xE000-0xFFFF) subtract 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two unicode objects code unit by code unit after applying
   the utf16Fixup adjustment.  Returns -1, 0 or 1.  This variant is
   compiled out by the enclosing #if 0.
   NOTE(review): indexing utf16Fixup with c >> 11 assumes code units
   below 0x10000 -- confirm before enabling on wide builds. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: values at or below it index only
           zero table slots, so the lookup is skipped for them */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6254
Marc-André Lemburge5034372000-08-08 08:04:29 +00006255#else
6256
6257static int
6258unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6259{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006260 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006261
6262 Py_UNICODE *s1 = str1->str;
6263 Py_UNICODE *s2 = str2->str;
6264
6265 len1 = str1->length;
6266 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006267
Marc-André Lemburge5034372000-08-08 08:04:29 +00006268 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006269 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006270
Fredrik Lundh45714e92001-06-26 16:39:36 +00006271 c1 = *s1++;
6272 c2 = *s2++;
6273
6274 if (c1 != c2)
6275 return (c1 < c2) ? -1 : 1;
6276
Marc-André Lemburge5034372000-08-08 08:04:29 +00006277 len1--; len2--;
6278 }
6279
6280 return (len1 < len2) ? -1 : (len1 != len2);
6281}
6282
6283#endif
6284
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285int PyUnicode_Compare(PyObject *left,
6286 PyObject *right)
6287{
6288 PyUnicodeObject *u = NULL, *v = NULL;
6289 int result;
6290
6291 /* Coerce the two arguments */
6292 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6293 if (u == NULL)
6294 goto onError;
6295 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6296 if (v == NULL)
6297 goto onError;
6298
Thomas Wouters7e474022000-07-16 12:04:32 +00006299 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 if (v == u) {
6301 Py_DECREF(u);
6302 Py_DECREF(v);
6303 return 0;
6304 }
6305
6306 result = unicode_compare(u, v);
6307
6308 Py_DECREF(u);
6309 Py_DECREF(v);
6310 return result;
6311
6312onError:
6313 Py_XDECREF(u);
6314 Py_XDECREF(v);
6315 return -1;
6316}
6317
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006318PyObject *PyUnicode_RichCompare(PyObject *left,
6319 PyObject *right,
6320 int op)
6321{
6322 int result;
6323
6324 result = PyUnicode_Compare(left, right);
6325 if (result == -1 && PyErr_Occurred())
6326 goto onError;
6327
6328 /* Convert the return value to a Boolean */
6329 switch (op) {
6330 case Py_EQ:
6331 result = (result == 0);
6332 break;
6333 case Py_NE:
6334 result = (result != 0);
6335 break;
6336 case Py_LE:
6337 result = (result <= 0);
6338 break;
6339 case Py_GE:
6340 result = (result >= 0);
6341 break;
6342 case Py_LT:
6343 result = (result == -1);
6344 break;
6345 case Py_GT:
6346 result = (result == 1);
6347 break;
6348 }
6349 return PyBool_FromLong(result);
6350
6351 onError:
6352
6353 /* Standard case
6354
6355 Type errors mean that PyUnicode_FromObject() could not convert
6356 one of the arguments (usually the right hand side) to Unicode,
6357 ie. we can't handle the comparison request. However, it is
6358 possible that the other object knows a comparison method, which
6359 is why we return Py_NotImplemented to give the other object a
6360 chance.
6361
6362 */
6363 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6364 PyErr_Clear();
6365 Py_INCREF(Py_NotImplemented);
6366 return Py_NotImplemented;
6367 }
6368 if (op != Py_EQ && op != Py_NE)
6369 return NULL;
6370
6371 /* Equality comparison.
6372
6373 This is a special case: we silence any PyExc_UnicodeDecodeError
6374 and instead turn it into a PyErr_UnicodeWarning.
6375
6376 */
6377 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6378 return NULL;
6379 PyErr_Clear();
6380 if (PyErr_Warn(PyExc_UnicodeWarning,
6381 (op == Py_EQ) ?
6382 "Unicode equal comparison "
6383 "failed to convert both arguments to Unicode - "
6384 "interpreting them as being unequal" :
6385 "Unicode unequal comparison "
6386 "failed to convert both arguments to Unicode - "
6387 "interpreting them as being unequal"
6388 ) < 0)
6389 return NULL;
6390 result = (op == Py_NE);
6391 return PyBool_FromLong(result);
6392}
6393
Guido van Rossum403d68b2000-03-13 15:55:09 +00006394int PyUnicode_Contains(PyObject *container,
6395 PyObject *element)
6396{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006397 PyObject *str, *sub;
6398 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006399
6400 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006401 sub = PyUnicode_FromObject(element);
6402 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006403 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006404 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006405 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006406 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006407
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006408 str = PyUnicode_FromObject(container);
6409 if (!str) {
6410 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006411 return -1;
6412 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006413
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006414 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006415
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006416 Py_DECREF(str);
6417 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006418
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006419 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006420}
6421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422/* Concat to string or Unicode object giving a new Unicode object. */
6423
6424PyObject *PyUnicode_Concat(PyObject *left,
6425 PyObject *right)
6426{
6427 PyUnicodeObject *u = NULL, *v = NULL, *w;
6428
6429 /* Coerce the two arguments */
6430 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6431 if (u == NULL)
6432 goto onError;
6433 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6434 if (v == NULL)
6435 goto onError;
6436
6437 /* Shortcuts */
6438 if (v == unicode_empty) {
6439 Py_DECREF(v);
6440 return (PyObject *)u;
6441 }
6442 if (u == unicode_empty) {
6443 Py_DECREF(u);
6444 return (PyObject *)v;
6445 }
6446
6447 /* Concat the two Unicode strings */
6448 w = _PyUnicode_New(u->length + v->length);
6449 if (w == NULL)
6450 goto onError;
6451 Py_UNICODE_COPY(w->str, u->str, u->length);
6452 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6453
6454 Py_DECREF(u);
6455 Py_DECREF(v);
6456 return (PyObject *)w;
6457
6458onError:
6459 Py_XDECREF(u);
6460 Py_XDECREF(v);
6461 return NULL;
6462}
6463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006464PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465"S.count(sub[, start[, end]]) -> int\n\
6466\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006467Return the number of non-overlapping occurrences of substring sub in\n\
6468Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
6471static PyObject *
6472unicode_count(PyUnicodeObject *self, PyObject *args)
6473{
6474 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006476 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 PyObject *result;
6478
Guido van Rossumb8872e62000-05-09 14:14:27 +00006479 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6480 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 return NULL;
6482
6483 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006484 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 if (substring == NULL)
6486 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006487
Fredrik Lundhc8162812006-05-26 19:33:03 +00006488 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006490 result = PyInt_FromSsize_t(
6491 stringlib_count(self->str + start, end - start,
6492 substring->str, substring->length)
6493 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
6495 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006496
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 return result;
6498}
6499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006500PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006501"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006503Encodes S using the codec registered for encoding. encoding defaults\n\
6504to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006505handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6507'xmlcharrefreplace' as well as any other name registered with\n\
6508codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509
6510static PyObject *
6511unicode_encode(PyUnicodeObject *self, PyObject *args)
6512{
6513 char *encoding = NULL;
6514 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515 PyObject *v;
6516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6518 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006520 if (v == NULL)
6521 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006522 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006523 PyErr_Format(PyExc_TypeError,
6524 "encoder did not return a string/unicode object "
6525 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006526 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006527 Py_DECREF(v);
6528 return NULL;
6529 }
6530 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006531
6532 onError:
6533 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006534}
6535
6536PyDoc_STRVAR(decode__doc__,
6537"S.decode([encoding[,errors]]) -> string or unicode\n\
6538\n\
6539Decodes S using the codec registered for encoding. encoding defaults\n\
6540to the default encoding. errors may be given to set a different error\n\
6541handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6542a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6543as well as any other name registerd with codecs.register_error that is\n\
6544able to handle UnicodeDecodeErrors.");
6545
6546static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006547unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006548{
6549 char *encoding = NULL;
6550 char *errors = NULL;
6551 PyObject *v;
6552
6553 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6554 return NULL;
6555 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006556 if (v == NULL)
6557 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006558 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006559 PyErr_Format(PyExc_TypeError,
6560 "decoder did not return a string/unicode object "
6561 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006562 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006563 Py_DECREF(v);
6564 return NULL;
6565 }
6566 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006567
6568 onError:
6569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570}
6571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006572PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573"S.expandtabs([tabsize]) -> unicode\n\
6574\n\
6575Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
6578static PyObject*
6579unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6580{
6581 Py_UNICODE *e;
6582 Py_UNICODE *p;
6583 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006584 Py_UNICODE *qe;
6585 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 PyUnicodeObject *u;
6587 int tabsize = 8;
6588
6589 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6590 return NULL;
6591
Thomas Wouters7e474022000-07-16 12:04:32 +00006592 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006593 i = 0; /* chars up to and including most recent \n or \r */
6594 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6595 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 for (p = self->str; p < e; p++)
6597 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006598 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006599 incr = tabsize - (j % tabsize); /* cannot overflow */
6600 if (j > PY_SSIZE_T_MAX - incr)
6601 goto overflow1;
6602 j += incr;
6603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 }
6605 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006606 if (j > PY_SSIZE_T_MAX - 1)
6607 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 j++;
6609 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006610 if (i > PY_SSIZE_T_MAX - j)
6611 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006613 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 }
6615 }
6616
Guido van Rossum5bdff602008-03-11 21:18:06 +00006617 if (i > PY_SSIZE_T_MAX - j)
6618 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 /* Second pass: create output string and fill it */
6621 u = _PyUnicode_New(i + j);
6622 if (!u)
6623 return NULL;
6624
Guido van Rossum5bdff602008-03-11 21:18:06 +00006625 j = 0; /* same as in first pass */
6626 q = u->str; /* next output char */
6627 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628
6629 for (p = self->str; p < e; p++)
6630 if (*p == '\t') {
6631 if (tabsize > 0) {
6632 i = tabsize - (j % tabsize);
6633 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006634 while (i--) {
6635 if (q >= qe)
6636 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 }
6640 }
6641 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006642 if (q >= qe)
6643 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006645 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 if (*p == '\n' || *p == '\r')
6647 j = 0;
6648 }
6649
6650 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006651
6652 overflow2:
6653 Py_DECREF(u);
6654 overflow1:
6655 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657}
6658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006659PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660"S.find(sub [,start [,end]]) -> int\n\
6661\n\
6662Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006663such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664arguments start and end are interpreted as in slice notation.\n\
6665\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
6668static PyObject *
6669unicode_find(PyUnicodeObject *self, PyObject *args)
6670{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006671 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006672 Py_ssize_t start;
6673 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006674 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
Facundo Batista57d56692007-11-16 18:04:14 +00006676 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006679 result = stringlib_find_slice(
6680 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6681 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6682 start, end
6683 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
6685 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006686
6687 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
6690static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692{
6693 if (index < 0 || index >= self->length) {
6694 PyErr_SetString(PyExc_IndexError, "string index out of range");
6695 return NULL;
6696 }
6697
6698 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6699}
6700
6701static long
6702unicode_hash(PyUnicodeObject *self)
6703{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006704 /* Since Unicode objects compare equal to their ASCII string
6705 counterparts, they should use the individual character values
6706 as basis for their hash value. This is needed to assure that
6707 strings and Unicode objects behave in the same way as
6708 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
Martin v. Löwis18e16552006-02-15 17:27:45 +00006710 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006711 register Py_UNICODE *p;
6712 register long x;
6713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (self->hash != -1)
6715 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006716 len = PyUnicode_GET_SIZE(self);
6717 p = PyUnicode_AS_UNICODE(self);
6718 x = *p << 7;
6719 while (--len >= 0)
6720 x = (1000003*x) ^ *p++;
6721 x ^= PyUnicode_GET_SIZE(self);
6722 if (x == -1)
6723 x = -2;
6724 self->hash = x;
6725 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726}
6727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729"S.index(sub [,start [,end]]) -> int\n\
6730\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733static PyObject *
6734unicode_index(PyUnicodeObject *self, PyObject *args)
6735{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006736 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006737 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006738 Py_ssize_t start;
6739 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
Facundo Batista57d56692007-11-16 18:04:14 +00006741 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006744 result = stringlib_find_slice(
6745 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6746 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6747 start, end
6748 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
6750 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 if (result < 0) {
6753 PyErr_SetString(PyExc_ValueError, "substring not found");
6754 return NULL;
6755 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006756
Martin v. Löwis18e16552006-02-15 17:27:45 +00006757 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006760PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006761"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006763Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006764at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
6766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006767unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
6769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6770 register const Py_UNICODE *e;
6771 int cased;
6772
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 /* Shortcut for single character strings */
6774 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006775 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006777 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006778 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 e = p + PyUnicode_GET_SIZE(self);
6782 cased = 0;
6783 for (; p < e; p++) {
6784 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006787 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 else if (!cased && Py_UNICODE_ISLOWER(ch))
6789 cased = 1;
6790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792}
6793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006797Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
6800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006801unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802{
6803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6804 register const Py_UNICODE *e;
6805 int cased;
6806
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 /* Shortcut for single character strings */
6808 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006811 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006812 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006814
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 e = p + PyUnicode_GET_SIZE(self);
6816 cased = 0;
6817 for (; p < e; p++) {
6818 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006821 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 else if (!cased && Py_UNICODE_ISUPPER(ch))
6823 cased = 1;
6824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006831Return True if S is a titlecased string and there is at least one\n\
6832character in S, i.e. upper- and titlecase characters may only\n\
6833follow uncased characters and lowercase characters only cased ones.\n\
6834Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
6836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006837unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838{
6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840 register const Py_UNICODE *e;
6841 int cased, previous_is_cased;
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 /* Shortcut for single character strings */
6844 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6846 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006849 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006850 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 e = p + PyUnicode_GET_SIZE(self);
6853 cased = 0;
6854 previous_is_cased = 0;
6855 for (; p < e; p++) {
6856 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006857
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6859 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 previous_is_cased = 1;
6862 cased = 1;
6863 }
6864 else if (Py_UNICODE_ISLOWER(ch)) {
6865 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 previous_is_cased = 1;
6868 cased = 1;
6869 }
6870 else
6871 previous_is_cased = 0;
6872 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874}
6875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006879Return True if all characters in S are whitespace\n\
6880and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
6882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006883unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884{
6885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6886 register const Py_UNICODE *e;
6887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 /* Shortcut for single character strings */
6889 if (PyUnicode_GET_SIZE(self) == 1 &&
6890 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006893 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006894 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006896
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 e = p + PyUnicode_GET_SIZE(self);
6898 for (; p < e; p++) {
6899 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903}
6904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006908Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910
6911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006912unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913{
6914 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6915 register const Py_UNICODE *e;
6916
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917 /* Shortcut for single character strings */
6918 if (PyUnicode_GET_SIZE(self) == 1 &&
6919 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921
6922 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006923 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006925
6926 e = p + PyUnicode_GET_SIZE(self);
6927 for (; p < e; p++) {
6928 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006931 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006932}
6933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006934PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006936\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006937Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006939
6940static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006941unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006942{
6943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6944 register const Py_UNICODE *e;
6945
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006946 /* Shortcut for single character strings */
6947 if (PyUnicode_GET_SIZE(self) == 1 &&
6948 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006950
6951 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006952 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006954
6955 e = p + PyUnicode_GET_SIZE(self);
6956 for (; p < e; p++) {
6957 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006959 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006961}
6962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006963PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006966Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968
6969static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006970unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971{
6972 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6973 register const Py_UNICODE *e;
6974
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 /* Shortcut for single character strings */
6976 if (PyUnicode_GET_SIZE(self) == 1 &&
6977 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006980 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006981 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006983
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 e = p + PyUnicode_GET_SIZE(self);
6985 for (; p < e; p++) {
6986 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990}
6991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006995Return True if all characters in S are digits\n\
6996and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
6998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006999unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000{
7001 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7002 register const Py_UNICODE *e;
7003
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 /* Shortcut for single character strings */
7005 if (PyUnicode_GET_SIZE(self) == 1 &&
7006 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007009 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007010 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007011 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007012
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 e = p + PyUnicode_GET_SIZE(self);
7014 for (; p < e; p++) {
7015 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019}
7020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007022"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007024Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026
7027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007028unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029{
7030 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7031 register const Py_UNICODE *e;
7032
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 /* Shortcut for single character strings */
7034 if (PyUnicode_GET_SIZE(self) == 1 &&
7035 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007036 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007038 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007039 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007040 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007041
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 e = p + PyUnicode_GET_SIZE(self);
7043 for (; p < e; p++) {
7044 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007045 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007047 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048}
7049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007050PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051"S.join(sequence) -> unicode\n\
7052\n\
7053Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
7056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007057unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060}
7061
Martin v. Löwis18e16552006-02-15 17:27:45 +00007062static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063unicode_length(PyUnicodeObject *self)
7064{
7065 return self->length;
7066}
7067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007069"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070\n\
7071Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007072done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
7074static PyObject *
7075unicode_ljust(PyUnicodeObject *self, PyObject *args)
7076{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007077 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007078 Py_UNICODE fillchar = ' ';
7079
Martin v. Löwis412fb672006-04-13 06:34:32 +00007080 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 return NULL;
7082
Tim Peters7a29bd52001-09-12 03:03:31 +00007083 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 Py_INCREF(self);
7085 return (PyObject*) self;
7086 }
7087
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007088 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089}
7090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092"S.lower() -> unicode\n\
7093\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007094Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007097unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return fixup(self, fixlower);
7100}
7101
/* Selector values telling the strip helpers which end(s) of the string
   to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix of
   the matching PyArg_ParseTuple format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7110
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111/* externally visible for str.strip(unicode) */
7112PyObject *
7113_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7114{
7115 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007116 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7119 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007121 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7122
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123 i = 0;
7124 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007125 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7126 i++;
7127 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128 }
7129
7130 j = len;
7131 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007132 do {
7133 j--;
7134 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7135 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007136 }
7137
7138 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007139 Py_INCREF(self);
7140 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141 }
7142 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007143 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007144}
7145
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152
7153 i = 0;
7154 if (striptype != RIGHTSTRIP) {
7155 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7156 i++;
7157 }
7158 }
7159
7160 j = len;
7161 if (striptype != LEFTSTRIP) {
7162 do {
7163 j--;
7164 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7165 j++;
7166 }
7167
7168 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7169 Py_INCREF(self);
7170 return (PyObject*)self;
7171 }
7172 else
7173 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174}
7175
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007176
7177static PyObject *
7178do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7179{
7180 PyObject *sep = NULL;
7181
7182 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7183 return NULL;
7184
7185 if (sep != NULL && sep != Py_None) {
7186 if (PyUnicode_Check(sep))
7187 return _PyUnicode_XStrip(self, striptype, sep);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007188 else if (PyString_Check(sep)) {
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189 PyObject *res;
7190 sep = PyUnicode_FromObject(sep);
7191 if (sep==NULL)
7192 return NULL;
7193 res = _PyUnicode_XStrip(self, striptype, sep);
7194 Py_DECREF(sep);
7195 return res;
7196 }
7197 else {
7198 PyErr_Format(PyExc_TypeError,
7199 "%s arg must be None, unicode or str",
7200 STRIPNAME(striptype));
7201 return NULL;
7202 }
7203 }
7204
7205 return do_strip(self, striptype);
7206}
7207
7208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007210"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007211\n\
7212Return a copy of the string S with leading and trailing\n\
7213whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214If chars is given and not None, remove characters in chars instead.\n\
7215If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216
7217static PyObject *
7218unicode_strip(PyUnicodeObject *self, PyObject *args)
7219{
7220 if (PyTuple_GET_SIZE(args) == 0)
7221 return do_strip(self, BOTHSTRIP); /* Common case */
7222 else
7223 return do_argstrip(self, BOTHSTRIP, args);
7224}
7225
7226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007227PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007228"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007229\n\
7230Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007231If chars is given and not None, remove characters in chars instead.\n\
7232If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007233
7234static PyObject *
7235unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7236{
7237 if (PyTuple_GET_SIZE(args) == 0)
7238 return do_strip(self, LEFTSTRIP); /* Common case */
7239 else
7240 return do_argstrip(self, LEFTSTRIP, args);
7241}
7242
7243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007245"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246\n\
7247Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007248If chars is given and not None, remove characters in chars instead.\n\
7249If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250
7251static PyObject *
7252unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7253{
7254 if (PyTuple_GET_SIZE(args) == 0)
7255 return do_strip(self, RIGHTSTRIP); /* Common case */
7256 else
7257 return do_argstrip(self, RIGHTSTRIP, args);
7258}
7259
7260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007262unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263{
7264 PyUnicodeObject *u;
7265 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007267 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
7269 if (len < 0)
7270 len = 0;
7271
Tim Peters7a29bd52001-09-12 03:03:31 +00007272 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 /* no repeat, return original string */
7274 Py_INCREF(str);
7275 return (PyObject*) str;
7276 }
Tim Peters8f422462000-09-09 06:13:41 +00007277
7278 /* ensure # of chars needed doesn't overflow int and # of bytes
7279 * needed doesn't overflow size_t
7280 */
7281 nchars = len * str->length;
7282 if (len && nchars / len != str->length) {
7283 PyErr_SetString(PyExc_OverflowError,
7284 "repeated string is too long");
7285 return NULL;
7286 }
7287 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7288 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7289 PyErr_SetString(PyExc_OverflowError,
7290 "repeated string is too long");
7291 return NULL;
7292 }
7293 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 if (!u)
7295 return NULL;
7296
7297 p = u->str;
7298
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007299 if (str->length == 1 && len > 0) {
7300 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007301 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007302 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007303 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007304 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007305 done = str->length;
7306 }
7307 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007308 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007309 Py_UNICODE_COPY(p+done, p, n);
7310 done += n;
7311 }
7312 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314 return (PyObject*) u;
7315}
7316
7317PyObject *PyUnicode_Replace(PyObject *obj,
7318 PyObject *subobj,
7319 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007320 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321{
7322 PyObject *self;
7323 PyObject *str1;
7324 PyObject *str2;
7325 PyObject *result;
7326
7327 self = PyUnicode_FromObject(obj);
7328 if (self == NULL)
7329 return NULL;
7330 str1 = PyUnicode_FromObject(subobj);
7331 if (str1 == NULL) {
7332 Py_DECREF(self);
7333 return NULL;
7334 }
7335 str2 = PyUnicode_FromObject(replobj);
7336 if (str2 == NULL) {
7337 Py_DECREF(self);
7338 Py_DECREF(str1);
7339 return NULL;
7340 }
Tim Petersced69f82003-09-16 20:30:58 +00007341 result = replace((PyUnicodeObject *)self,
7342 (PyUnicodeObject *)str1,
7343 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 maxcount);
7345 Py_DECREF(self);
7346 Py_DECREF(str1);
7347 Py_DECREF(str2);
7348 return result;
7349}
7350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007351PyDoc_STRVAR(replace__doc__,
Georg Brandl30fadc12008-05-30 07:54:16 +00007352"S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353\n\
7354Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007355old replaced by new. If the optional argument count is\n\
7356given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357
7358static PyObject*
7359unicode_replace(PyUnicodeObject *self, PyObject *args)
7360{
7361 PyUnicodeObject *str1;
7362 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 PyObject *result;
7365
Martin v. Löwis18e16552006-02-15 17:27:45 +00007366 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 return NULL;
7368 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7369 if (str1 == NULL)
7370 return NULL;
7371 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007372 if (str2 == NULL) {
7373 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
7377 result = replace(self, str1, str2, maxcount);
7378
7379 Py_DECREF(str1);
7380 Py_DECREF(str2);
7381 return result;
7382}
7383
7384static
7385PyObject *unicode_repr(PyObject *unicode)
7386{
7387 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7388 PyUnicode_GET_SIZE(unicode),
7389 1);
7390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393"S.rfind(sub [,start [,end]]) -> int\n\
7394\n\
7395Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007396such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397arguments start and end are interpreted as in slice notation.\n\
7398\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007399Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401static PyObject *
7402unicode_rfind(PyUnicodeObject *self, PyObject *args)
7403{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007404 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007405 Py_ssize_t start;
7406 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007407 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
Facundo Batista57d56692007-11-16 18:04:14 +00007409 if (!_ParseTupleFinds(args, &substring, &start, &end))
7410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007412 result = stringlib_rfind_slice(
7413 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7414 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7415 start, end
7416 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007419
7420 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421}
7422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007423PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424"S.rindex(sub [,start [,end]]) -> int\n\
7425\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007426Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
7428static PyObject *
7429unicode_rindex(PyUnicodeObject *self, PyObject *args)
7430{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007431 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007432 Py_ssize_t start;
7433 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007434 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435
Facundo Batista57d56692007-11-16 18:04:14 +00007436 if (!_ParseTupleFinds(args, &substring, &start, &end))
7437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007439 result = stringlib_rfind_slice(
7440 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7441 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7442 start, end
7443 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
7445 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 if (result < 0) {
7448 PyErr_SetString(PyExc_ValueError, "substring not found");
7449 return NULL;
7450 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007451 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452}
7453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007455"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456\n\
7457Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007458done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
7460static PyObject *
7461unicode_rjust(PyUnicodeObject *self, PyObject *args)
7462{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007463 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007464 Py_UNICODE fillchar = ' ';
7465
Martin v. Löwis412fb672006-04-13 06:34:32 +00007466 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 return NULL;
7468
Tim Peters7a29bd52001-09-12 03:03:31 +00007469 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 Py_INCREF(self);
7471 return (PyObject*) self;
7472 }
7473
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007474 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475}
7476
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479{
7480 /* standard clamping */
7481 if (start < 0)
7482 start = 0;
7483 if (end < 0)
7484 end = 0;
7485 if (end > self->length)
7486 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007487 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 /* full slice, return original string */
7489 Py_INCREF(self);
7490 return (PyObject*) self;
7491 }
7492 if (start > end)
7493 start = end;
7494 /* copy slice */
7495 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7496 end - start);
7497}
7498
7499PyObject *PyUnicode_Split(PyObject *s,
7500 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007501 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502{
7503 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007504
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 s = PyUnicode_FromObject(s);
7506 if (s == NULL)
7507 return NULL;
7508 if (sep != NULL) {
7509 sep = PyUnicode_FromObject(sep);
7510 if (sep == NULL) {
7511 Py_DECREF(s);
7512 return NULL;
7513 }
7514 }
7515
7516 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7517
7518 Py_DECREF(s);
7519 Py_XDECREF(sep);
7520 return result;
7521}
7522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524"S.split([sep [,maxsplit]]) -> list of strings\n\
7525\n\
7526Return a list of the words in S, using sep as the\n\
7527delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007528splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007529whitespace string is a separator and empty strings are\n\
7530removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531
7532static PyObject*
7533unicode_split(PyUnicodeObject *self, PyObject *args)
7534{
7535 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 return NULL;
7540
7541 if (substring == Py_None)
7542 return split(self, NULL, maxcount);
7543 else if (PyUnicode_Check(substring))
7544 return split(self, (PyUnicodeObject *)substring, maxcount);
7545 else
7546 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7547}
7548
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007549PyObject *
7550PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7551{
7552 PyObject* str_obj;
7553 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007554 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007555
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007556 str_obj = PyUnicode_FromObject(str_in);
7557 if (!str_obj)
7558 return NULL;
7559 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007560 if (!sep_obj) {
7561 Py_DECREF(str_obj);
7562 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007563 }
7564
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007565 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007566 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7567 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7568 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007569
Fredrik Lundhb9479482006-05-26 17:22:38 +00007570 Py_DECREF(sep_obj);
7571 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007572
7573 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007574}
7575
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007576
7577PyObject *
7578PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7579{
7580 PyObject* str_obj;
7581 PyObject* sep_obj;
7582 PyObject* out;
7583
7584 str_obj = PyUnicode_FromObject(str_in);
7585 if (!str_obj)
7586 return NULL;
7587 sep_obj = PyUnicode_FromObject(sep_in);
7588 if (!sep_obj) {
7589 Py_DECREF(str_obj);
7590 return NULL;
7591 }
7592
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007593 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007594 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7595 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7596 );
7597
7598 Py_DECREF(sep_obj);
7599 Py_DECREF(str_obj);
7600
7601 return out;
7602}
7603
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007604PyDoc_STRVAR(partition__doc__,
7605"S.partition(sep) -> (head, sep, tail)\n\
7606\n\
7607Searches for the separator sep in S, and returns the part before it,\n\
7608the separator itself, and the part after it. If the separator is not\n\
7609found, returns S and two empty strings.");
7610
7611static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007612unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007613{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007614 return PyUnicode_Partition((PyObject *)self, separator);
7615}
7616
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007617PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007618"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007619\n\
7620Searches for the separator sep in S, starting at the end of S, and returns\n\
7621the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007622separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007623
7624static PyObject*
7625unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7626{
7627 return PyUnicode_RPartition((PyObject *)self, separator);
7628}
7629
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007630PyObject *PyUnicode_RSplit(PyObject *s,
7631 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007632 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007633{
7634 PyObject *result;
7635
7636 s = PyUnicode_FromObject(s);
7637 if (s == NULL)
7638 return NULL;
7639 if (sep != NULL) {
7640 sep = PyUnicode_FromObject(sep);
7641 if (sep == NULL) {
7642 Py_DECREF(s);
7643 return NULL;
7644 }
7645 }
7646
7647 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7648
7649 Py_DECREF(s);
7650 Py_XDECREF(sep);
7651 return result;
7652}
7653
7654PyDoc_STRVAR(rsplit__doc__,
7655"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7656\n\
7657Return a list of the words in S, using sep as the\n\
7658delimiter string, starting at the end of the string and\n\
7659working to the front. If maxsplit is given, at most maxsplit\n\
7660splits are done. If sep is not specified, any whitespace string\n\
7661is a separator.");
7662
7663static PyObject*
7664unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7665{
7666 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007668
Martin v. Löwis18e16552006-02-15 17:27:45 +00007669 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007670 return NULL;
7671
7672 if (substring == Py_None)
7673 return rsplit(self, NULL, maxcount);
7674 else if (PyUnicode_Check(substring))
7675 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7676 else
7677 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7678}
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007681"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682\n\
7683Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007684Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
7687static PyObject*
7688unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7689{
Guido van Rossum86662912000-04-11 15:38:46 +00007690 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
Guido van Rossum86662912000-04-11 15:38:46 +00007692 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 return NULL;
7694
Guido van Rossum86662912000-04-11 15:38:46 +00007695 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696}
7697
7698static
7699PyObject *unicode_str(PyUnicodeObject *self)
7700{
Fred Drakee4315f52000-05-09 19:53:39 +00007701 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705"S.swapcase() -> unicode\n\
7706\n\
7707Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709
7710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007711unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 return fixup(self, fixswapcase);
7714}
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717"S.translate(table) -> unicode\n\
7718\n\
7719Return a copy of the string S, where all characters have been mapped\n\
7720through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007721Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7722Unmapped characters are left untouched. Characters mapped to None\n\
7723are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
7725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007726unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727{
Tim Petersced69f82003-09-16 20:30:58 +00007728 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007730 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 "ignore");
7732}
7733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007734PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735"S.upper() -> unicode\n\
7736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007737Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
7739static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007740unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 return fixup(self, fixupper);
7743}
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746"S.zfill(width) -> unicode\n\
7747\n\
7748Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750
7751static PyObject *
7752unicode_zfill(PyUnicodeObject *self, PyObject *args)
7753{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007754 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 PyUnicodeObject *u;
7756
Martin v. Löwis18e16552006-02-15 17:27:45 +00007757 Py_ssize_t width;
7758 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 return NULL;
7760
7761 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007762 if (PyUnicode_CheckExact(self)) {
7763 Py_INCREF(self);
7764 return (PyObject*) self;
7765 }
7766 else
7767 return PyUnicode_FromUnicode(
7768 PyUnicode_AS_UNICODE(self),
7769 PyUnicode_GET_SIZE(self)
7770 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 }
7772
7773 fill = width - self->length;
7774
7775 u = pad(self, fill, 0, '0');
7776
Walter Dörwald068325e2002-04-15 13:36:47 +00007777 if (u == NULL)
7778 return NULL;
7779
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 if (u->str[fill] == '+' || u->str[fill] == '-') {
7781 /* move sign to beginning of string */
7782 u->str[0] = u->str[fill];
7783 u->str[fill] = '0';
7784 }
7785
7786 return (PyObject*) u;
7787}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
#if 0
/* Compiled out by the surrounding #if 0.  Returns the value of numfree
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007798"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007800Return True if S starts with the specified prefix, False otherwise.\n\
7801With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007802With optional end, stop comparing S at that position.\n\
7803prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject *
7806unicode_startswith(PyUnicodeObject *self,
7807 PyObject *args)
7808{
Georg Brandl24250812006-06-09 18:45:48 +00007809 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007812 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007813 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
Georg Brandl24250812006-06-09 18:45:48 +00007815 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007816 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007818 if (PyTuple_Check(subobj)) {
7819 Py_ssize_t i;
7820 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7821 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7822 PyTuple_GET_ITEM(subobj, i));
7823 if (substring == NULL)
7824 return NULL;
7825 result = tailmatch(self, substring, start, end, -1);
7826 Py_DECREF(substring);
7827 if (result) {
7828 Py_RETURN_TRUE;
7829 }
7830 }
7831 /* nothing matched */
7832 Py_RETURN_FALSE;
7833 }
7834 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007836 return NULL;
7837 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007839 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840}
7841
7842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007843PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007844"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007846Return True if S ends with the specified suffix, False otherwise.\n\
7847With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007848With optional end, stop comparing S at that position.\n\
7849suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
7851static PyObject *
7852unicode_endswith(PyUnicodeObject *self,
7853 PyObject *args)
7854{
Georg Brandl24250812006-06-09 18:45:48 +00007855 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007857 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007858 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007859 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Georg Brandl24250812006-06-09 18:45:48 +00007861 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7862 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007864 if (PyTuple_Check(subobj)) {
7865 Py_ssize_t i;
7866 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7867 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7868 PyTuple_GET_ITEM(subobj, i));
7869 if (substring == NULL)
7870 return NULL;
7871 result = tailmatch(self, substring, start, end, +1);
7872 Py_DECREF(substring);
7873 if (result) {
7874 Py_RETURN_TRUE;
7875 }
7876 }
7877 Py_RETURN_FALSE;
7878 }
7879 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882
Georg Brandl24250812006-06-09 18:45:48 +00007883 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007885 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886}
7887
7888
Eric Smitha9f7d622008-02-17 19:46:49 +00007889/* Implements do_string_format, which is unicode because of stringlib */
7890#include "stringlib/string_format.h"
7891
7892PyDoc_STRVAR(format__doc__,
7893"S.format(*args, **kwargs) -> unicode\n\
7894\n\
7895");
7896
Eric Smithdc13b792008-05-30 18:10:04 +00007897static PyObject *
7898unicode__format__(PyObject *self, PyObject *args)
7899{
7900 PyObject *format_spec;
7901 PyObject *result = NULL;
7902 PyObject *tmp = NULL;
7903
7904 /* If 2.x, convert format_spec to the same type as value */
7905 /* This is to allow things like u''.format('') */
7906 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7907 goto done;
7908 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7909 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7910 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7911 goto done;
7912 }
7913 tmp = PyObject_Unicode(format_spec);
7914 if (tmp == NULL)
7915 goto done;
7916 format_spec = tmp;
7917
7918 result = _PyUnicode_FormatAdvanced(self,
7919 PyUnicode_AS_UNICODE(format_spec),
7920 PyUnicode_GET_SIZE(format_spec));
7921done:
7922 Py_XDECREF(tmp);
7923 return result;
7924}
7925
Eric Smitha9f7d622008-02-17 19:46:49 +00007926PyDoc_STRVAR(p_format__doc__,
7927"S.__format__(format_spec) -> unicode\n\
7928\n\
7929");
7930
Robert Schuppenies901c9972008-06-10 10:10:31 +00007931static PyObject *
7932unicode__sizeof__(PyUnicodeObject *v)
7933{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007934 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7935 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007936}
7937
7938PyDoc_STRVAR(sizeof__doc__,
7939"S.__sizeof__() -> size of S in memory, in bytes\n\
7940\n\
7941");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007942
7943static PyObject *
7944unicode_getnewargs(PyUnicodeObject *v)
7945{
7946 return Py_BuildValue("(u#)", v->str, v->length);
7947}
7948
7949
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950static PyMethodDef unicode_methods[] = {
7951
7952 /* Order is according to common usage: often used methods should
7953 appear first, since lookup is done sequentially. */
7954
Georg Brandlecdc0a92006-03-30 12:19:07 +00007955 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007956 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7957 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007958 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007959 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7960 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7961 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7962 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7963 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7964 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7965 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007966 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007967 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7968 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7969 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007970 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007971 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007972/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7973 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7974 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7975 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007976 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007977 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007978 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007979 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7981 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7982 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7983 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7984 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7985 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7986 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7987 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7988 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7989 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7990 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7991 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7992 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7993 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007994 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007995 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7996 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7997 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7998 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007999 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008000#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008001 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002#endif
8003
8004#if 0
8005 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008006 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007#endif
8008
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008009 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 {NULL, NULL}
8011};
8012
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008013static PyObject *
8014unicode_mod(PyObject *v, PyObject *w)
8015{
8016 if (!PyUnicode_Check(v)) {
8017 Py_INCREF(Py_NotImplemented);
8018 return Py_NotImplemented;
8019 }
8020 return PyUnicode_Format(v, w);
8021}
8022
8023static PyNumberMethods unicode_as_number = {
8024 0, /*nb_add*/
8025 0, /*nb_subtract*/
8026 0, /*nb_multiply*/
8027 0, /*nb_divide*/
8028 unicode_mod, /*nb_remainder*/
8029};
8030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008032 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00008033 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8035 (ssizeargfunc) unicode_getitem, /* sq_item */
8036 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 0, /* sq_ass_item */
8038 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00008039 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040};
8041
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008042static PyObject*
8043unicode_subscript(PyUnicodeObject* self, PyObject* item)
8044{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008045 if (PyIndex_Check(item)) {
8046 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008047 if (i == -1 && PyErr_Occurred())
8048 return NULL;
8049 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008050 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051 return unicode_getitem(self, i);
8052 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008054 Py_UNICODE* source_buf;
8055 Py_UNICODE* result_buf;
8056 PyObject* result;
8057
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008058 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008059 &start, &stop, &step, &slicelength) < 0) {
8060 return NULL;
8061 }
8062
8063 if (slicelength <= 0) {
8064 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008065 } else if (start == 0 && step == 1 && slicelength == self->length &&
8066 PyUnicode_CheckExact(self)) {
8067 Py_INCREF(self);
8068 return (PyObject *)self;
8069 } else if (step == 1) {
8070 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008071 } else {
8072 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008073 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8074 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008075
8076 if (result_buf == NULL)
8077 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008078
8079 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8080 result_buf[i] = source_buf[cur];
8081 }
Tim Petersced69f82003-09-16 20:30:58 +00008082
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008083 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008084 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008085 return result;
8086 }
8087 } else {
8088 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8089 return NULL;
8090 }
8091}
8092
8093static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008095 (binaryfunc)unicode_subscript, /* mp_subscript */
8096 (objobjargproc)0, /* mp_ass_subscript */
8097};
8098
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 const void **ptr)
8103{
8104 if (index != 0) {
8105 PyErr_SetString(PyExc_SystemError,
8106 "accessing non-existent unicode segment");
8107 return -1;
8108 }
8109 *ptr = (void *) self->str;
8110 return PyUnicode_GET_DATA_SIZE(self);
8111}
8112
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113static Py_ssize_t
8114unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 const void **ptr)
8116{
8117 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008118 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 return -1;
8120}
8121
8122static int
8123unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125{
8126 if (lenp)
8127 *lenp = PyUnicode_GET_DATA_SIZE(self);
8128 return 1;
8129}
8130
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008131static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008133 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 const void **ptr)
8135{
8136 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008137
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 if (index != 0) {
8139 PyErr_SetString(PyExc_SystemError,
8140 "accessing non-existent unicode segment");
8141 return -1;
8142 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008143 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 if (str == NULL)
8145 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008146 *ptr = (void *) PyString_AS_STRING(str);
8147 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148}
8149
8150/* Helpers for PyUnicode_Format() */
8151
8152static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008153getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008155 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 if (argidx < arglen) {
8157 (*p_argidx)++;
8158 if (arglen < 0)
8159 return args;
8160 else
8161 return PyTuple_GetItem(args, argidx);
8162 }
8163 PyErr_SetString(PyExc_TypeError,
8164 "not enough arguments for format string");
8165 return NULL;
8166}
8167
/* Bit flags for a format specifier, one bit each.  F_ALT is what
   formatfloat()/formatint() turn into a '#' in the C format string;
   the others presumably follow the matching printf flag characters
   ('-', '+', ' ', '0') -- confirm against the parser. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8173
Martin v. Löwis18e16552006-02-15 17:27:45 +00008174static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008175strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008177 register Py_ssize_t i;
8178 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 for (i = len - 1; i >= 0; i--)
8180 buffer[i] = (Py_UNICODE) charbuffer[i];
8181
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 return len;
8183}
8184
Neal Norwitzfc76d632006-01-10 06:03:13 +00008185static int
8186doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8187{
Tim Peters15231542006-02-16 01:08:01 +00008188 Py_ssize_t result;
8189
Neal Norwitzfc76d632006-01-10 06:03:13 +00008190 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008191 result = strtounicode(buffer, (char *)buffer);
8192 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008193}
8194
8195static int
8196longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8197{
Tim Peters15231542006-02-16 01:08:01 +00008198 Py_ssize_t result;
8199
Neal Norwitzfc76d632006-01-10 06:03:13 +00008200 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008201 result = strtounicode(buffer, (char *)buffer);
8202 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008203}
8204
Guido van Rossum078151d2002-08-11 04:24:12 +00008205/* XXX To save some code duplication, formatfloat/long/int could have been
8206 shared with stringobject.c, converting from 8-bit to Unicode after the
8207 formatting is done. */
8208
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209static int
8210formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008211 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 int flags,
8213 int prec,
8214 int type,
8215 PyObject *v)
8216{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008217 /* fmt = '%#.' + `prec` + `type`
8218 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 char fmt[20];
8220 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008221
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 x = PyFloat_AsDouble(v);
8223 if (x == -1.0 && PyErr_Occurred())
8224 return -1;
8225 if (prec < 0)
8226 prec = 6;
Eric Smithd6c393a2008-07-17 19:49:47 +00008227 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8228 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008229 /* Worst case length calc to ensure no buffer overrun:
8230
8231 'g' formats:
8232 fmt = %#.<prec>g
8233 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8234 for any double rep.)
8235 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8236
8237 'f' formats:
8238 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8239 len = 1 + 50 + 1 + prec = 52 + prec
8240
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008241 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008242 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008243
8244 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008245 if (((type == 'g' || type == 'G') &&
8246 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smithd6c393a2008-07-17 19:49:47 +00008247 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008248 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008249 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 return -1;
8251 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008252 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8253 (flags&F_ALT) ? "#" : "",
8254 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008255 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256}
8257
Tim Peters38fd5b62000-09-21 05:43:11 +00008258static PyObject*
8259formatlong(PyObject *val, int flags, int prec, int type)
8260{
8261 char *buf;
8262 int i, len;
8263 PyObject *str; /* temporary string object. */
8264 PyUnicodeObject *result;
8265
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008266 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008267 if (!str)
8268 return NULL;
8269 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008270 if (!result) {
8271 Py_DECREF(str);
8272 return NULL;
8273 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008274 for (i = 0; i < len; i++)
8275 result->str[i] = buf[i];
8276 result->str[len] = 0;
8277 Py_DECREF(str);
8278 return (PyObject*)result;
8279}
8280
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281static int
8282formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008283 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 int flags,
8285 int prec,
8286 int type,
8287 PyObject *v)
8288{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008289 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008290 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8291 * + 1 + 1
8292 * = 24
8293 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008294 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008295 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 long x;
8297
8298 x = PyInt_AsLong(v);
8299 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008300 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008301 if (x < 0 && type == 'u') {
8302 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008303 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008304 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8305 sign = "-";
8306 else
8307 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008309 prec = 1;
8310
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008311 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8312 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008314 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008315 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008316 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008317 return -1;
8318 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008319
8320 if ((flags & F_ALT) &&
8321 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008322 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323 * of issues that cause pain:
8324 * - when 0 is being converted, the C standard leaves off
8325 * the '0x' or '0X', which is inconsistent with other
8326 * %#x/%#X conversions and inconsistent with Python's
8327 * hex() function
8328 * - there are platforms that violate the standard and
8329 * convert 0 with the '0x' or '0X'
8330 * (Metrowerks, Compaq Tru64)
8331 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008332 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008333 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008334 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008335 * We can achieve the desired consistency by inserting our
8336 * own '0x' or '0X' prefix, and substituting %x/%X in place
8337 * of %#x/%#X.
8338 *
8339 * Note that this is the same approach as used in
8340 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008341 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008342 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8343 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008344 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008345 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008346 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8347 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008348 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008349 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008350 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008351 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008352 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008353 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354}
8355
8356static int
8357formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008358 size_t buflen,
8359 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008361 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008362 if (PyUnicode_Check(v)) {
8363 if (PyUnicode_GET_SIZE(v) != 1)
8364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008368 else if (PyString_Check(v)) {
8369 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008370 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008371 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373
8374 else {
8375 /* Integer input truncated to a character */
8376 long x;
8377 x = PyInt_AsLong(v);
8378 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008379 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008380#ifdef Py_UNICODE_WIDE
8381 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008382 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008383 "%c arg not in range(0x110000) "
8384 "(wide Python build)");
8385 return -1;
8386 }
8387#else
8388 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008389 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008390 "%c arg not in range(0x10000) "
8391 "(narrow Python build)");
8392 return -1;
8393 }
8394#endif
8395 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
8397 buf[1] = '\0';
8398 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008399
8400 onError:
8401 PyErr_SetString(PyExc_TypeError,
8402 "%c requires int or char");
8403 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404}
8405
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8415
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416PyObject *PyUnicode_Format(PyObject *format,
8417 PyObject *args)
8418{
8419 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008420 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 int args_owned = 0;
8422 PyUnicodeObject *result = NULL;
8423 PyObject *dict = NULL;
8424 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008425
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 if (format == NULL || args == NULL) {
8427 PyErr_BadInternalCall();
8428 return NULL;
8429 }
8430 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008431 if (uformat == NULL)
8432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 fmt = PyUnicode_AS_UNICODE(uformat);
8434 fmtcnt = PyUnicode_GET_SIZE(uformat);
8435
8436 reslen = rescnt = fmtcnt + 100;
8437 result = _PyUnicode_New(reslen);
8438 if (result == NULL)
8439 goto onError;
8440 res = PyUnicode_AS_UNICODE(result);
8441
8442 if (PyTuple_Check(args)) {
8443 arglen = PyTuple_Size(args);
8444 argidx = 0;
8445 }
8446 else {
8447 arglen = -1;
8448 argidx = -2;
8449 }
Christian Heimese93237d2007-12-19 02:37:44 +00008450 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008451 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 dict = args;
8453
8454 while (--fmtcnt >= 0) {
8455 if (*fmt != '%') {
8456 if (--rescnt < 0) {
8457 rescnt = fmtcnt + 100;
8458 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008459 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8462 --rescnt;
8463 }
8464 *res++ = *fmt++;
8465 }
8466 else {
8467 /* Got a format specifier */
8468 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008469 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 Py_UNICODE c = '\0';
8472 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008473 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 PyObject *v = NULL;
8475 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008476 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008479 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480
8481 fmt++;
8482 if (*fmt == '(') {
8483 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008484 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 PyObject *key;
8486 int pcount = 1;
8487
8488 if (dict == NULL) {
8489 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008490 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 goto onError;
8492 }
8493 ++fmt;
8494 --fmtcnt;
8495 keystart = fmt;
8496 /* Skip over balanced parentheses */
8497 while (pcount > 0 && --fmtcnt >= 0) {
8498 if (*fmt == ')')
8499 --pcount;
8500 else if (*fmt == '(')
8501 ++pcount;
8502 fmt++;
8503 }
8504 keylen = fmt - keystart - 1;
8505 if (fmtcnt < 0 || pcount > 0) {
8506 PyErr_SetString(PyExc_ValueError,
8507 "incomplete format key");
8508 goto onError;
8509 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008510#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008511 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 then looked up since Python uses strings to hold
8513 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008514 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 key = PyUnicode_EncodeUTF8(keystart,
8516 keylen,
8517 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008518#else
8519 key = PyUnicode_FromUnicode(keystart, keylen);
8520#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (key == NULL)
8522 goto onError;
8523 if (args_owned) {
8524 Py_DECREF(args);
8525 args_owned = 0;
8526 }
8527 args = PyObject_GetItem(dict, key);
8528 Py_DECREF(key);
8529 if (args == NULL) {
8530 goto onError;
8531 }
8532 args_owned = 1;
8533 arglen = -1;
8534 argidx = -2;
8535 }
8536 while (--fmtcnt >= 0) {
8537 switch (c = *fmt++) {
8538 case '-': flags |= F_LJUST; continue;
8539 case '+': flags |= F_SIGN; continue;
8540 case ' ': flags |= F_BLANK; continue;
8541 case '#': flags |= F_ALT; continue;
8542 case '0': flags |= F_ZERO; continue;
8543 }
8544 break;
8545 }
8546 if (c == '*') {
8547 v = getnextarg(args, arglen, &argidx);
8548 if (v == NULL)
8549 goto onError;
8550 if (!PyInt_Check(v)) {
8551 PyErr_SetString(PyExc_TypeError,
8552 "* wants int");
8553 goto onError;
8554 }
8555 width = PyInt_AsLong(v);
8556 if (width < 0) {
8557 flags |= F_LJUST;
8558 width = -width;
8559 }
8560 if (--fmtcnt >= 0)
8561 c = *fmt++;
8562 }
8563 else if (c >= '0' && c <= '9') {
8564 width = c - '0';
8565 while (--fmtcnt >= 0) {
8566 c = *fmt++;
8567 if (c < '0' || c > '9')
8568 break;
8569 if ((width*10) / 10 != width) {
8570 PyErr_SetString(PyExc_ValueError,
8571 "width too big");
8572 goto onError;
8573 }
8574 width = width*10 + (c - '0');
8575 }
8576 }
8577 if (c == '.') {
8578 prec = 0;
8579 if (--fmtcnt >= 0)
8580 c = *fmt++;
8581 if (c == '*') {
8582 v = getnextarg(args, arglen, &argidx);
8583 if (v == NULL)
8584 goto onError;
8585 if (!PyInt_Check(v)) {
8586 PyErr_SetString(PyExc_TypeError,
8587 "* wants int");
8588 goto onError;
8589 }
8590 prec = PyInt_AsLong(v);
8591 if (prec < 0)
8592 prec = 0;
8593 if (--fmtcnt >= 0)
8594 c = *fmt++;
8595 }
8596 else if (c >= '0' && c <= '9') {
8597 prec = c - '0';
8598 while (--fmtcnt >= 0) {
8599 c = Py_CHARMASK(*fmt++);
8600 if (c < '0' || c > '9')
8601 break;
8602 if ((prec*10) / 10 != prec) {
8603 PyErr_SetString(PyExc_ValueError,
8604 "prec too big");
8605 goto onError;
8606 }
8607 prec = prec*10 + (c - '0');
8608 }
8609 }
8610 } /* prec */
8611 if (fmtcnt >= 0) {
8612 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 if (--fmtcnt >= 0)
8614 c = *fmt++;
8615 }
8616 }
8617 if (fmtcnt < 0) {
8618 PyErr_SetString(PyExc_ValueError,
8619 "incomplete format");
8620 goto onError;
8621 }
8622 if (c != '%') {
8623 v = getnextarg(args, arglen, &argidx);
8624 if (v == NULL)
8625 goto onError;
8626 }
8627 sign = 0;
8628 fill = ' ';
8629 switch (c) {
8630
8631 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008632 pbuf = formatbuf;
8633 /* presume that buffer length is at least 1 */
8634 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 len = 1;
8636 break;
8637
8638 case 's':
8639 case 'r':
8640 if (PyUnicode_Check(v) && c == 's') {
8641 temp = v;
8642 Py_INCREF(temp);
8643 }
8644 else {
8645 PyObject *unicode;
8646 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008647 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 else
8649 temp = PyObject_Repr(v);
8650 if (temp == NULL)
8651 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008652 if (PyUnicode_Check(temp))
8653 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008654 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008655 /* convert to string to Unicode */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008656 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8657 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008658 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008660 Py_DECREF(temp);
8661 temp = unicode;
8662 if (temp == NULL)
8663 goto onError;
8664 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008665 else {
8666 Py_DECREF(temp);
8667 PyErr_SetString(PyExc_TypeError,
8668 "%s argument has non-string str()");
8669 goto onError;
8670 }
8671 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008672 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 len = PyUnicode_GET_SIZE(temp);
8674 if (prec >= 0 && len > prec)
8675 len = prec;
8676 break;
8677
8678 case 'i':
8679 case 'd':
8680 case 'u':
8681 case 'o':
8682 case 'x':
8683 case 'X':
8684 if (c == 'i')
8685 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008686 isnumok = 0;
8687 if (PyNumber_Check(v)) {
8688 PyObject *iobj=NULL;
8689
8690 if (PyInt_Check(v) || (PyLong_Check(v))) {
8691 iobj = v;
8692 Py_INCREF(iobj);
8693 }
8694 else {
8695 iobj = PyNumber_Int(v);
8696 if (iobj==NULL) iobj = PyNumber_Long(v);
8697 }
8698 if (iobj!=NULL) {
8699 if (PyInt_Check(iobj)) {
8700 isnumok = 1;
8701 pbuf = formatbuf;
8702 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8703 flags, prec, c, iobj);
8704 Py_DECREF(iobj);
8705 if (len < 0)
8706 goto onError;
8707 sign = 1;
8708 }
8709 else if (PyLong_Check(iobj)) {
8710 isnumok = 1;
8711 temp = formatlong(iobj, flags, prec, c);
8712 Py_DECREF(iobj);
8713 if (!temp)
8714 goto onError;
8715 pbuf = PyUnicode_AS_UNICODE(temp);
8716 len = PyUnicode_GET_SIZE(temp);
8717 sign = 1;
8718 }
8719 else {
8720 Py_DECREF(iobj);
8721 }
8722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008724 if (!isnumok) {
8725 PyErr_Format(PyExc_TypeError,
8726 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008727 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008728 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008729 }
8730 if (flags & F_ZERO)
8731 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 break;
8733
8734 case 'e':
8735 case 'E':
8736 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008737 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 case 'g':
8739 case 'G':
Eric Smithd6c393a2008-07-17 19:49:47 +00008740 if (c == 'F')
8741 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008742 pbuf = formatbuf;
8743 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8744 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 if (len < 0)
8746 goto onError;
8747 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008748 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 fill = '0';
8750 break;
8751
8752 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008753 pbuf = formatbuf;
8754 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 if (len < 0)
8756 goto onError;
8757 break;
8758
8759 default:
8760 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008761 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008762 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008763 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008764 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008765 (Py_ssize_t)(fmt - 1 -
8766 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 goto onError;
8768 }
8769 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008770 if (*pbuf == '-' || *pbuf == '+') {
8771 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 len--;
8773 }
8774 else if (flags & F_SIGN)
8775 sign = '+';
8776 else if (flags & F_BLANK)
8777 sign = ' ';
8778 else
8779 sign = 0;
8780 }
8781 if (width < len)
8782 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008783 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 reslen -= rescnt;
8785 rescnt = width + fmtcnt + 100;
8786 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008787 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008788 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008789 PyErr_NoMemory();
8790 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008791 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008792 if (_PyUnicode_Resize(&result, reslen) < 0) {
8793 Py_XDECREF(temp);
8794 goto onError;
8795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 res = PyUnicode_AS_UNICODE(result)
8797 + reslen - rescnt;
8798 }
8799 if (sign) {
8800 if (fill != ' ')
8801 *res++ = sign;
8802 rescnt--;
8803 if (width > len)
8804 width--;
8805 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008806 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8807 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008808 assert(pbuf[1] == c);
8809 if (fill != ' ') {
8810 *res++ = *pbuf++;
8811 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008812 }
Tim Petersfff53252001-04-12 18:38:48 +00008813 rescnt -= 2;
8814 width -= 2;
8815 if (width < 0)
8816 width = 0;
8817 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 if (width > len && !(flags & F_LJUST)) {
8820 do {
8821 --rescnt;
8822 *res++ = fill;
8823 } while (--width > len);
8824 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008825 if (fill == ' ') {
8826 if (sign)
8827 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008828 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008829 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008830 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008831 *res++ = *pbuf++;
8832 *res++ = *pbuf++;
8833 }
8834 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008835 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 res += len;
8837 rescnt -= len;
8838 while (--width >= len) {
8839 --rescnt;
8840 *res++ = ' ';
8841 }
8842 if (dict && (argidx < arglen) && c != '%') {
8843 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008844 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008845 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 goto onError;
8847 }
8848 Py_XDECREF(temp);
8849 } /* '%' */
8850 } /* until end */
8851 if (argidx < arglen && !dict) {
8852 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008853 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 goto onError;
8855 }
8856
Thomas Woutersa96affe2006-03-12 00:29:36 +00008857 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8858 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 if (args_owned) {
8860 Py_DECREF(args);
8861 }
8862 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 return (PyObject *)result;
8864
8865 onError:
8866 Py_XDECREF(result);
8867 Py_DECREF(uformat);
8868 if (args_owned) {
8869 Py_DECREF(args);
8870 }
8871 return NULL;
8872}
8873
8874static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008875 (readbufferproc) unicode_buffer_getreadbuf,
8876 (writebufferproc) unicode_buffer_getwritebuf,
8877 (segcountproc) unicode_buffer_getsegcount,
8878 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879};
8880
Jeremy Hylton938ace62002-07-17 16:30:39 +00008881static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008882unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8883
Tim Peters6d6c1a32001-08-02 04:15:00 +00008884static PyObject *
8885unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8886{
8887 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008888 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008889 char *encoding = NULL;
8890 char *errors = NULL;
8891
Guido van Rossume023fe02001-08-30 03:12:59 +00008892 if (type != &PyUnicode_Type)
8893 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008894 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8895 kwlist, &x, &encoding, &errors))
8896 return NULL;
8897 if (x == NULL)
8898 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008899 if (encoding == NULL && errors == NULL)
8900 return PyObject_Unicode(x);
8901 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902 return PyUnicode_FromEncodedObject(x, encoding, errors);
8903}
8904
Guido van Rossume023fe02001-08-30 03:12:59 +00008905static PyObject *
8906unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8907{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008908 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008909 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008910
8911 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8912 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8913 if (tmp == NULL)
8914 return NULL;
8915 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008916 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008917 if (pnew == NULL) {
8918 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008919 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008920 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008921 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008922 if (pnew->str == NULL) {
8923 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008924 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008925 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008926 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008927 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008928 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8929 pnew->length = n;
8930 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008931 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008932 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008933}
8934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008935PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008936"unicode(string [, encoding[, errors]]) -> object\n\
8937\n\
8938Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008939encoding defaults to the current default string encoding.\n\
8940errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008941
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008943 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 "unicode", /* tp_name */
8945 sizeof(PyUnicodeObject), /* tp_size */
8946 0, /* tp_itemsize */
8947 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008948 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008950 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008952 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008953 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008954 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008956 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 (hashfunc) unicode_hash, /* tp_hash*/
8958 0, /* tp_call*/
8959 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008960 PyObject_GenericGetAttr, /* tp_getattro */
8961 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008963 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008964 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008965 unicode_doc, /* tp_doc */
8966 0, /* tp_traverse */
8967 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008968 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008969 0, /* tp_weaklistoffset */
8970 0, /* tp_iter */
8971 0, /* tp_iternext */
8972 unicode_methods, /* tp_methods */
8973 0, /* tp_members */
8974 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008975 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008976 0, /* tp_dict */
8977 0, /* tp_descr_get */
8978 0, /* tp_descr_set */
8979 0, /* tp_dictoffset */
8980 0, /* tp_init */
8981 0, /* tp_alloc */
8982 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008983 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984};
8985
8986/* Initialize the Unicode implementation */
8987
Thomas Wouters78890102000-07-22 19:25:51 +00008988void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008990 int i;
8991
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008992 /* XXX - move this array to unicodectype.c ? */
8993 Py_UNICODE linebreak[] = {
8994 0x000A, /* LINE FEED */
8995 0x000D, /* CARRIAGE RETURN */
8996 0x001C, /* FILE SEPARATOR */
8997 0x001D, /* GROUP SEPARATOR */
8998 0x001E, /* RECORD SEPARATOR */
8999 0x0085, /* NEXT LINE */
9000 0x2028, /* LINE SEPARATOR */
9001 0x2029, /* PARAGRAPH SEPARATOR */
9002 };
9003
Fred Drakee4315f52000-05-09 19:53:39 +00009004 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009005 free_list = NULL;
9006 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009008 if (!unicode_empty)
9009 return;
9010
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009011 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009012 for (i = 0; i < 256; i++)
9013 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009014 if (PyType_Ready(&PyUnicode_Type) < 0)
9015 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009016
9017 /* initialize the linebreak bloom filter */
9018 bloom_linebreak = make_bloom_mask(
9019 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9020 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009021
9022 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023}
9024
9025/* Finalize the Unicode implementation */
9026
Christian Heimes3b718a72008-02-14 12:47:33 +00009027int
9028PyUnicode_ClearFreeList(void)
9029{
9030 int freelist_size = numfree;
9031 PyUnicodeObject *u;
9032
9033 for (u = free_list; u != NULL;) {
9034 PyUnicodeObject *v = u;
9035 u = *(PyUnicodeObject **)u;
9036 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00009037 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00009038 Py_XDECREF(v->defenc);
9039 PyObject_Del(v);
9040 numfree--;
9041 }
9042 free_list = NULL;
9043 assert(numfree == 0);
9044 return freelist_size;
9045}
9046
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047void
Thomas Wouters78890102000-07-22 19:25:51 +00009048_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009050 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009052 Py_XDECREF(unicode_empty);
9053 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009054
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009055 for (i = 0; i < 256; i++) {
9056 if (unicode_latin1[i]) {
9057 Py_DECREF(unicode_latin1[i]);
9058 unicode_latin1[i] = NULL;
9059 }
9060 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009061 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009063
Anthony Baxterac6bd462006-04-13 02:06:09 +00009064#ifdef __cplusplus
9065}
9066#endif
9067
9068
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009069/*
9070Local variables:
9071c-basic-offset: 4
9072indent-tabs-mode: nil
9073End:
9074*/