blob: dd56e113d26e82aa2e4e3ac1edef62eb81429c0e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
Christian Heimes5b970ad2008-02-06 13:33:44 +000056#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Christian Heimes5b970ad2008-02-06 13:33:44 +000064 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
Christian Heimes4d4f2702008-01-30 11:32:37 +0000117/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0..127): an entry is 1 when
   the character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
148/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0..127): an entry is 1 when
   the character is treated as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
/* Integer type used to hold a bloom bitmask (bits 0..31 are used). */
#define BLOOM_MASK unsigned long

/* Bloom mask for line-break characters; it is populated outside this
   block. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is an unsigned long: `1 << 31` on a
   signed int is undefined behaviour and, where it yields INT_MIN, would
   sign-extend into the upper bits of a 64-bit mask.  `mask` is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: ASCII characters are answered from the lookup table,
   anything else goes through the bloom pre-filter before the full
   Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Membership test with a bloom pre-filter: the linear scan in
   unicode_member() only runs when the bloom bit for `chr` is set in
   `mask`.  The whole expansion is parenthesized so the macro behaves as
   a single operand (e.g. under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000264 unicode->str = PyObject_REALLOC(unicode->str,
265 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000267 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_NoMemory();
269 return -1;
270 }
271 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000272 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 if (unicode->defenc) {
277 Py_DECREF(unicode->defenc);
278 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return 0;
283}
284
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
292
293static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295{
296 register PyUnicodeObject *unicode;
297
Andrew Dalkee0df7622006-05-27 11:04:36 +0000298 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (length == 0 && unicode_empty != NULL) {
300 Py_INCREF(unicode_empty);
301 return unicode_empty;
302 }
303
304 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000305 if (free_list) {
306 unicode = free_list;
307 free_list = *(PyUnicodeObject **)unicode;
308 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 /* Keep-Alive optimization: we only upsize the buffer,
311 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000312 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000313 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000314 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000318 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000319 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
320 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 }
322 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000325 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000326 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (unicode == NULL)
328 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000329 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000333 if (!unicode->str) {
334 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000335 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000337 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000338 * the caller fails before initializing str -- unicode_resize()
339 * reads str[0], and the Keep-Alive optimization can keep memory
340 * allocated for str alive across a call to unicode_dealloc(unicode).
341 * We don't want unicode_resize to read uninitialized memory in
342 * that case.
343 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000344 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000346 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350
351 onError:
352 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000353 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355}
356
357static
Guido van Rossum9475a232001-10-05 20:51:39 +0000358void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000360 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000361 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000362 /* Keep-Alive optimization */
363 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000364 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str = NULL;
366 unicode->length = 0;
367 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000368 if (unicode->defenc) {
369 Py_DECREF(unicode->defenc);
370 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000371 }
372 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000373 *(PyUnicodeObject **)unicode = free_list;
374 free_list = unicode;
375 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 }
377 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000378 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000380 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382}
383
Martin v. Löwis18e16552006-02-15 17:27:45 +0000384int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000385{
386 register PyUnicodeObject *v;
387
388 /* Argument checks */
389 if (unicode == NULL) {
390 PyErr_BadInternalCall();
391 return -1;
392 }
393 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000394 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 PyErr_BadInternalCall();
396 return -1;
397 }
398
399 /* Resizing unicode_empty and single character objects is not
400 possible since these are being shared. We simply return a fresh
401 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000402 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 (v == unicode_empty || v->length == 1)) {
404 PyUnicodeObject *w = _PyUnicode_New(length);
405 if (w == NULL)
406 return -1;
407 Py_UNICODE_COPY(w->str, v->str,
408 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000409 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000410 *unicode = (PyObject *)w;
411 return 0;
412 }
413
414 /* Note that we don't have to modify *unicode for unshared Unicode
415 objects, since we can modify them in-place. */
416 return unicode_resize(v, length);
417}
418
419/* Internal API for use in unicodeobject.c only ! */
420#define _PyUnicode_Resize(unicodevar, length) \
421 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000424 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425{
426 PyUnicodeObject *unicode;
427
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 /* If the Unicode data is known at construction time, we can apply
429 some optimizations which share commonly used objects. */
430 if (u != NULL) {
431
432 /* Optimization for empty strings */
433 if (size == 0 && unicode_empty != NULL) {
434 Py_INCREF(unicode_empty);
435 return (PyObject *)unicode_empty;
436 }
437
438 /* Single character Unicode objects in the Latin-1 range are
439 shared when using this constructor */
440 if (size == 1 && *u < 256) {
441 unicode = unicode_latin1[*u];
442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 if (!unicode)
445 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000446 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 unicode_latin1[*u] = unicode;
448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
Tim Petersced69f82003-09-16 20:30:58 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 unicode = _PyUnicode_New(size);
455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461
462 return (PyObject *)unicode;
463}
464
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000465PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
466{
467 PyUnicodeObject *unicode;
468 /* If the Unicode data is known at construction time, we can apply
469 some optimizations which share commonly used objects.
470 Also, this means the input must be UTF-8, so fall back to the
471 UTF-8 decoder at the end. */
472 if (u != NULL) {
473
474 /* Optimization for empty strings */
475 if (size == 0 && unicode_empty != NULL) {
476 Py_INCREF(unicode_empty);
477 return (PyObject *)unicode_empty;
478 }
479
480 /* Single characters are shared when using this constructor.
481 Restrict to ASCII, since the input must be UTF-8. */
482 if (size == 1 && Py_CHARMASK(*u) < 128) {
483 unicode = unicode_latin1[Py_CHARMASK(*u)];
484 if (!unicode) {
485 unicode = _PyUnicode_New(1);
486 if (!unicode)
487 return NULL;
488 unicode->str[0] = Py_CHARMASK(*u);
489 unicode_latin1[Py_CHARMASK(*u)] = unicode;
490 }
491 Py_INCREF(unicode);
492 return (PyObject *)unicode;
493 }
494
495 return PyUnicode_DecodeUTF8(u, size, NULL);
496 }
497
498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 return (PyObject *)unicode;
503}
504
505PyObject *PyUnicode_FromString(const char *u)
506{
507 size_t size = strlen(u);
508 if (size > PY_SSIZE_T_MAX) {
509 PyErr_SetString(PyExc_OverflowError, "input too long");
510 return NULL;
511 }
512
513 return PyUnicode_FromStringAndSize(u, size);
514}
515
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516#ifdef HAVE_WCHAR_H
517
518PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000519 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520{
521 PyUnicodeObject *unicode;
522
523 if (w == NULL) {
524 PyErr_BadInternalCall();
525 return NULL;
526 }
527
528 unicode = _PyUnicode_New(size);
529 if (!unicode)
530 return NULL;
531
532 /* Copy the wchar_t data into the new object */
533#ifdef HAVE_USABLE_WCHAR_T
534 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000535#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536 {
537 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000538 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000539 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000540 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541 *u++ = *w++;
542 }
543#endif
544
545 return (PyObject *)unicode;
546}
547
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000548static void
549makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
550{
551 *fmt++ = '%';
552 if (width) {
553 if (zeropad)
554 *fmt++ = '0';
555 fmt += sprintf(fmt, "%d", width);
556 }
557 if (precision)
558 fmt += sprintf(fmt, ".%d", precision);
559 if (longflag)
560 *fmt++ = 'l';
561 else if (size_tflag) {
562 char *f = PY_FORMAT_SIZE_T;
563 while (*f)
564 *fmt++ = *f++;
565 }
566 *fmt++ = c;
567 *fmt = '\0';
568}
569
/* Append the NUL-terminated char string `string` at the output cursor
   `s`, advancing `s` past the copied characters.  Relies on the
   caller's locals `s` (output cursor) and `copy` (scratch
   `const char *`).  Wrapped in do/while(0) so that `appendstring(x);`
   is exactly one statement (safe in an unbraced if/else); the argument
   is parenthesized. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
571
572PyObject *
573PyUnicode_FromFormatV(const char *format, va_list vargs)
574{
575 va_list count;
576 Py_ssize_t callcount = 0;
577 PyObject **callresults = NULL;
578 PyObject **callresult = NULL;
579 Py_ssize_t n = 0;
580 int width = 0;
581 int precision = 0;
582 int zeropad;
583 const char* f;
584 Py_UNICODE *s;
585 PyObject *string;
586 /* used by sprintf */
587 char buffer[21];
588 /* use abuffer instead of buffer, if we need more space
589 * (which can happen if there's a format specifier with width). */
590 char *abuffer = NULL;
591 char *realbuffer;
592 Py_ssize_t abuffersize = 0;
593 char fmt[60]; /* should be enough for %0width.precisionld */
594 const char *copy;
595
596#ifdef VA_LIST_IS_ARRAY
597 Py_MEMCPY(count, vargs, sizeof(va_list));
598#else
599#ifdef __va_copy
600 __va_copy(count, vargs);
601#else
602 count = vargs;
603#endif
604#endif
605 /* step 1: count the number of %S/%R format specifications
606 * (we call PyObject_Str()/PyObject_Repr() for these objects
607 * once during step 3 and put the result in an array) */
608 for (f = format; *f; f++) {
609 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
610 ++callcount;
611 }
612 /* step 2: allocate memory for the results of
613 * PyObject_Str()/PyObject_Repr() calls */
614 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000615 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000616 if (!callresults) {
617 PyErr_NoMemory();
618 return NULL;
619 }
620 callresult = callresults;
621 }
622 /* step 3: figure out how large a buffer we need */
623 for (f = format; *f; f++) {
624 if (*f == '%') {
625 const char* p = f;
626 width = 0;
627 while (isdigit(*f))
628 width = (width*10) + *f++ - '0';
629 while (*++f && *f != '%' && !isalpha(*f))
630 ;
631
632 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
633 * they don't affect the amount of space we reserve.
634 */
635 if ((*f == 'l' || *f == 'z') &&
636 (f[1] == 'd' || f[1] == 'u'))
637 ++f;
638
639 switch (*f) {
640 case 'c':
641 (void)va_arg(count, int);
642 /* fall through... */
643 case '%':
644 n++;
645 break;
646 case 'd': case 'u': case 'i': case 'x':
647 (void) va_arg(count, int);
648 /* 20 bytes is enough to hold a 64-bit
649 integer. Decimal takes the most space.
650 This isn't enough for octal.
651 If a width is specified we need more
652 (which we allocate later). */
653 if (width < 20)
654 width = 20;
655 n += width;
656 if (abuffersize < width)
657 abuffersize = width;
658 break;
659 case 's':
660 {
661 /* UTF-8 */
662 unsigned char*s;
663 s = va_arg(count, unsigned char*);
664 while (*s) {
665 if (*s < 128) {
666 n++; s++;
667 } else if (*s < 0xc0) {
668 /* invalid UTF-8 */
669 n++; s++;
670 } else if (*s < 0xc0) {
671 n++;
672 s++; if(!*s)break;
673 s++;
674 } else if (*s < 0xe0) {
675 n++;
676 s++; if(!*s)break;
677 s++; if(!*s)break;
678 s++;
679 } else {
680 #ifdef Py_UNICODE_WIDE
681 n++;
682 #else
683 n+=2;
684 #endif
685 s++; if(!*s)break;
686 s++; if(!*s)break;
687 s++; if(!*s)break;
688 s++;
689 }
690 }
691 break;
692 }
693 case 'U':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 assert(obj && PyUnicode_Check(obj));
697 n += PyUnicode_GET_SIZE(obj);
698 break;
699 }
700 case 'V':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 const char *str = va_arg(count, const char *);
704 assert(obj || str);
705 assert(!obj || PyUnicode_Check(obj));
706 if (obj)
707 n += PyUnicode_GET_SIZE(obj);
708 else
709 n += strlen(str);
710 break;
711 }
712 case 'S':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 PyObject *str;
716 assert(obj);
717 str = PyObject_Str(obj);
718 if (!str)
719 goto fail;
720 n += PyUnicode_GET_SIZE(str);
721 /* Remember the str and switch to the next slot */
722 *callresult++ = str;
723 break;
724 }
725 case 'R':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 PyObject *repr;
729 assert(obj);
730 repr = PyObject_Repr(obj);
731 if (!repr)
732 goto fail;
733 n += PyUnicode_GET_SIZE(repr);
734 /* Remember the repr and switch to the next slot */
735 *callresult++ = repr;
736 break;
737 }
738 case 'p':
739 (void) va_arg(count, int);
740 /* maximum 64-bit pointer representation:
741 * 0xffffffffffffffff
742 * so 19 characters is enough.
743 * XXX I count 18 -- what's the extra for?
744 */
745 n += 19;
746 break;
747 default:
748 /* if we stumble upon an unknown
749 formatting code, copy the rest of
750 the format string to the output
751 string. (we cannot just skip the
752 code, since there's no way to know
753 what's in the argument list) */
754 n += strlen(p);
755 goto expand;
756 }
757 } else
758 n++;
759 }
760 expand:
761 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000762 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000763 if (!abuffer) {
764 PyErr_NoMemory();
765 goto fail;
766 }
767 realbuffer = abuffer;
768 }
769 else
770 realbuffer = buffer;
771 /* step 4: fill the buffer */
772 /* Since we've analyzed how much space we need for the worst case,
773 we don't have to resize the string.
774 There can be no errors beyond this point. */
775 string = PyUnicode_FromUnicode(NULL, n);
776 if (!string)
777 goto fail;
778
779 s = PyUnicode_AS_UNICODE(string);
780 callresult = callresults;
781
782 for (f = format; *f; f++) {
783 if (*f == '%') {
784 const char* p = f++;
785 int longflag = 0;
786 int size_tflag = 0;
787 zeropad = (*f == '0');
788 /* parse the width.precision part */
789 width = 0;
790 while (isdigit(*f))
791 width = (width*10) + *f++ - '0';
792 precision = 0;
793 if (*f == '.') {
794 f++;
795 while (isdigit(*f))
796 precision = (precision*10) + *f++ - '0';
797 }
798 /* handle the long flag, but only for %ld and %lu.
799 others can be added when necessary. */
800 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
801 longflag = 1;
802 ++f;
803 }
804 /* handle the size_t flag. */
805 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
806 size_tflag = 1;
807 ++f;
808 }
809
810 switch (*f) {
811 case 'c':
812 *s++ = va_arg(vargs, int);
813 break;
814 case 'd':
815 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
816 if (longflag)
817 sprintf(realbuffer, fmt, va_arg(vargs, long));
818 else if (size_tflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
820 else
821 sprintf(realbuffer, fmt, va_arg(vargs, int));
822 appendstring(realbuffer);
823 break;
824 case 'u':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
832 appendstring(realbuffer);
833 break;
834 case 'i':
835 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
836 sprintf(realbuffer, fmt, va_arg(vargs, int));
837 appendstring(realbuffer);
838 break;
839 case 'x':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 's':
845 {
846 /* Parameter must be UTF-8 encoded.
847 In case of encoding errors, use
848 the replacement character. */
849 PyObject *u;
850 p = va_arg(vargs, char*);
851 u = PyUnicode_DecodeUTF8(p, strlen(p),
852 "replace");
853 if (!u)
854 goto fail;
855 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
856 PyUnicode_GET_SIZE(u));
857 s += PyUnicode_GET_SIZE(u);
858 Py_DECREF(u);
859 break;
860 }
861 case 'U':
862 {
863 PyObject *obj = va_arg(vargs, PyObject *);
864 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
865 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
866 s += size;
867 break;
868 }
869 case 'V':
870 {
871 PyObject *obj = va_arg(vargs, PyObject *);
872 const char *str = va_arg(vargs, const char *);
873 if (obj) {
874 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
875 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
876 s += size;
877 } else {
878 appendstring(str);
879 }
880 break;
881 }
882 case 'S':
883 case 'R':
884 {
885 Py_UNICODE *ucopy;
886 Py_ssize_t usize;
887 Py_ssize_t upos;
888 /* unused, since we already have the result */
889 (void) va_arg(vargs, PyObject *);
890 ucopy = PyUnicode_AS_UNICODE(*callresult);
891 usize = PyUnicode_GET_SIZE(*callresult);
892 for (upos = 0; upos<usize;)
893 *s++ = ucopy[upos++];
894 /* We're done with the unicode()/repr() => forget it */
895 Py_DECREF(*callresult);
896 /* switch to next unicode()/repr() result */
897 ++callresult;
898 break;
899 }
900 case 'p':
901 sprintf(buffer, "%p", va_arg(vargs, void*));
902 /* %p is ill-defined: ensure leading 0x. */
903 if (buffer[1] == 'X')
904 buffer[1] = 'x';
905 else if (buffer[1] != 'x') {
906 memmove(buffer+2, buffer, strlen(buffer)+1);
907 buffer[0] = '0';
908 buffer[1] = 'x';
909 }
910 appendstring(buffer);
911 break;
912 case '%':
913 *s++ = '%';
914 break;
915 default:
916 appendstring(p);
917 goto end;
918 }
919 } else
920 *s++ = *f;
921 }
922
923 end:
924 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000925 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000926 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000927 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000928 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
929 return string;
930 fail:
931 if (callresults) {
932 PyObject **callresult2 = callresults;
933 while (callresult2 < callresult) {
934 Py_DECREF(*callresult2);
935 ++callresult2;
936 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000937 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938 }
939 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000940 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000941 return NULL;
942}
943
944#undef appendstring
945
946PyObject *
947PyUnicode_FromFormat(const char *format, ...)
948{
949 PyObject* ret;
950 va_list vargs;
951
952#ifdef HAVE_STDARG_PROTOTYPES
953 va_start(vargs, format);
954#else
955 va_start(vargs);
956#endif
957 ret = PyUnicode_FromFormatV(format, vargs);
958 va_end(vargs);
959 return ret;
960}
961
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
963 wchar_t *w,
964 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965{
966 if (unicode == NULL) {
967 PyErr_BadInternalCall();
968 return -1;
969 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000970
971 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000973 size = PyUnicode_GET_SIZE(unicode) + 1;
974
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975#ifdef HAVE_USABLE_WCHAR_T
976 memcpy(w, unicode->str, size * sizeof(wchar_t));
977#else
978 {
979 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000980 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000981 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000982 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983 *w++ = *u++;
984 }
985#endif
986
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000987 if (size > PyUnicode_GET_SIZE(unicode))
988 return PyUnicode_GET_SIZE(unicode);
989 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990 return size;
991}
992
993#endif
994
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000995PyObject *PyUnicode_FromOrdinal(int ordinal)
996{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000997 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000998
999#ifdef Py_UNICODE_WIDE
1000 if (ordinal < 0 || ordinal > 0x10ffff) {
1001 PyErr_SetString(PyExc_ValueError,
1002 "unichr() arg not in range(0x110000) "
1003 "(wide Python build)");
1004 return NULL;
1005 }
1006#else
1007 if (ordinal < 0 || ordinal > 0xffff) {
1008 PyErr_SetString(PyExc_ValueError,
1009 "unichr() arg not in range(0x10000) "
1010 "(narrow Python build)");
1011 return NULL;
1012 }
1013#endif
1014
Hye-Shik Chang40574832004-04-06 07:24:51 +00001015 s[0] = (Py_UNICODE)ordinal;
1016 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001017}
1018
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019PyObject *PyUnicode_FromObject(register PyObject *obj)
1020{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 /* XXX Perhaps we should make this API an alias of
1022 PyObject_Unicode() instead ?! */
1023 if (PyUnicode_CheckExact(obj)) {
1024 Py_INCREF(obj);
1025 return obj;
1026 }
1027 if (PyUnicode_Check(obj)) {
1028 /* For a Unicode subtype that's not a Unicode object,
1029 return a true Unicode object with the same data. */
1030 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1031 PyUnicode_GET_SIZE(obj));
1032 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001033 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1034}
1035
1036PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1037 const char *encoding,
1038 const char *errors)
1039{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001040 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044 if (obj == NULL) {
1045 PyErr_BadInternalCall();
1046 return NULL;
1047 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001049#if 0
1050 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001051 that no encodings is given and then redirect to
1052 PyObject_Unicode() which then applies the additional logic for
1053 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055 NOTE: This API should really only be used for object which
1056 represent *encoded* Unicode !
1057
1058 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001059 if (PyUnicode_Check(obj)) {
1060 if (encoding) {
1061 PyErr_SetString(PyExc_TypeError,
1062 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001066 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067#else
1068 if (PyUnicode_Check(obj)) {
1069 PyErr_SetString(PyExc_TypeError,
1070 "decoding Unicode is not supported");
1071 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001072 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001073#endif
1074
1075 /* Coerce object */
1076 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001077 s = PyString_AS_STRING(obj);
1078 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001079 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001080 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1081 /* Overwrite the error message with something more useful in
1082 case of a TypeError. */
1083 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001084 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001085 "coercing to Unicode: need string or buffer, "
1086 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001087 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001088 goto onError;
1089 }
Tim Petersced69f82003-09-16 20:30:58 +00001090
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001091 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 if (len == 0) {
1093 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001094 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 }
Tim Petersced69f82003-09-16 20:30:58 +00001096 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001097 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001098
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001099 return v;
1100
1101 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103}
1104
1105PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001106 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 const char *encoding,
1108 const char *errors)
1109{
1110 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001111
1112 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001113 encoding = PyUnicode_GetDefaultEncoding();
1114
1115 /* Shortcuts for common default encodings */
1116 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001118 else if (strcmp(encoding, "latin-1") == 0)
1119 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001120#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1121 else if (strcmp(encoding, "mbcs") == 0)
1122 return PyUnicode_DecodeMBCS(s, size, errors);
1123#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001124 else if (strcmp(encoding, "ascii") == 0)
1125 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126
1127 /* Decode via the codec registry */
1128 buffer = PyBuffer_FromMemory((void *)s, size);
1129 if (buffer == NULL)
1130 goto onError;
1131 unicode = PyCodec_Decode(buffer, encoding, errors);
1132 if (unicode == NULL)
1133 goto onError;
1134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001136 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001137 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 Py_DECREF(unicode);
1139 goto onError;
1140 }
1141 Py_DECREF(buffer);
1142 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001143
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 onError:
1145 Py_XDECREF(buffer);
1146 return NULL;
1147}
1148
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001149PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1150 const char *encoding,
1151 const char *errors)
1152{
1153 PyObject *v;
1154
1155 if (!PyUnicode_Check(unicode)) {
1156 PyErr_BadArgument();
1157 goto onError;
1158 }
1159
1160 if (encoding == NULL)
1161 encoding = PyUnicode_GetDefaultEncoding();
1162
1163 /* Decode via the codec registry */
1164 v = PyCodec_Decode(unicode, encoding, errors);
1165 if (v == NULL)
1166 goto onError;
1167 return v;
1168
1169 onError:
1170 return NULL;
1171}
1172
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001174 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 const char *encoding,
1176 const char *errors)
1177{
1178 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 unicode = PyUnicode_FromUnicode(s, size);
1181 if (unicode == NULL)
1182 return NULL;
1183 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1184 Py_DECREF(unicode);
1185 return v;
1186}
1187
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001188PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1189 const char *encoding,
1190 const char *errors)
1191{
1192 PyObject *v;
1193
1194 if (!PyUnicode_Check(unicode)) {
1195 PyErr_BadArgument();
1196 goto onError;
1197 }
1198
1199 if (encoding == NULL)
1200 encoding = PyUnicode_GetDefaultEncoding();
1201
1202 /* Encode via the codec registry */
1203 v = PyCodec_Encode(unicode, encoding, errors);
1204 if (v == NULL)
1205 goto onError;
1206 return v;
1207
1208 onError:
1209 return NULL;
1210}
1211
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1213 const char *encoding,
1214 const char *errors)
1215{
1216 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 if (!PyUnicode_Check(unicode)) {
1219 PyErr_BadArgument();
1220 goto onError;
1221 }
Fred Drakee4315f52000-05-09 19:53:39 +00001222
Tim Petersced69f82003-09-16 20:30:58 +00001223 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001224 encoding = PyUnicode_GetDefaultEncoding();
1225
1226 /* Shortcuts for common default encodings */
1227 if (errors == NULL) {
1228 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001229 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001230 else if (strcmp(encoding, "latin-1") == 0)
1231 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001232#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1233 else if (strcmp(encoding, "mbcs") == 0)
1234 return PyUnicode_AsMBCSString(unicode);
1235#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001236 else if (strcmp(encoding, "ascii") == 0)
1237 return PyUnicode_AsASCIIString(unicode);
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239
1240 /* Encode via the codec registry */
1241 v = PyCodec_Encode(unicode, encoding, errors);
1242 if (v == NULL)
1243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 if (!PyString_Check(v)) {
1245 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001246 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001247 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 Py_DECREF(v);
1249 goto onError;
1250 }
1251 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 onError:
1254 return NULL;
1255}
1256
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001257PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1258 const char *errors)
1259{
1260 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1261
1262 if (v)
1263 return v;
1264 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1265 if (v && errors == NULL)
1266 ((PyUnicodeObject *)unicode)->defenc = v;
1267 return v;
1268}
1269
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1271{
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_BadArgument();
1274 goto onError;
1275 }
1276 return PyUnicode_AS_UNICODE(unicode);
1277
1278 onError:
1279 return NULL;
1280}
1281
Martin v. Löwis18e16552006-02-15 17:27:45 +00001282Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283{
1284 if (!PyUnicode_Check(unicode)) {
1285 PyErr_BadArgument();
1286 goto onError;
1287 }
1288 return PyUnicode_GET_SIZE(unicode);
1289
1290 onError:
1291 return -1;
1292}
1293
Thomas Wouters78890102000-07-22 19:25:51 +00001294const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001295{
1296 return unicode_default_encoding;
1297}
1298
1299int PyUnicode_SetDefaultEncoding(const char *encoding)
1300{
1301 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001302
Fred Drakee4315f52000-05-09 19:53:39 +00001303 /* Make sure the encoding is valid. As side effect, this also
1304 loads the encoding into the codec registry cache. */
1305 v = _PyCodec_Lookup(encoding);
1306 if (v == NULL)
1307 goto onError;
1308 Py_DECREF(v);
1309 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001310 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001311 sizeof(unicode_default_encoding));
1312 return 0;
1313
1314 onError:
1315 return -1;
1316}
1317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318/* error handling callback helper:
1319 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001320 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001321 and adjust various state variables.
1322 return 0 on success, -1 on error
1323*/
1324
1325static
1326int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1327 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001328 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1329 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001330 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001332 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333
1334 PyObject *restuple = NULL;
1335 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1337 Py_ssize_t requiredsize;
1338 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001340 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 int res = -1;
1342
1343 if (*errorHandler == NULL) {
1344 *errorHandler = PyCodec_LookupError(errors);
1345 if (*errorHandler == NULL)
1346 goto onError;
1347 }
1348
1349 if (*exceptionObject == NULL) {
1350 *exceptionObject = PyUnicodeDecodeError_Create(
1351 encoding, input, insize, *startinpos, *endinpos, reason);
1352 if (*exceptionObject == NULL)
1353 goto onError;
1354 }
1355 else {
1356 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1357 goto onError;
1358 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1359 goto onError;
1360 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1361 goto onError;
1362 }
1363
1364 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1365 if (restuple == NULL)
1366 goto onError;
1367 if (!PyTuple_Check(restuple)) {
1368 PyErr_Format(PyExc_TypeError, &argparse[4]);
1369 goto onError;
1370 }
1371 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1372 goto onError;
1373 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001374 newpos = insize+newpos;
1375 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001376 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001377 goto onError;
1378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001379
1380 /* need more space? (at least enough for what we
1381 have+the replacement+the rest of the string (starting
1382 at the new input position), so we won't have to check space
1383 when there are no errors in the rest of the string) */
1384 repptr = PyUnicode_AS_UNICODE(repunicode);
1385 repsize = PyUnicode_GET_SIZE(repunicode);
1386 requiredsize = *outpos + repsize + insize-newpos;
1387 if (requiredsize > outsize) {
1388 if (requiredsize<2*outsize)
1389 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001390 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391 goto onError;
1392 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1393 }
1394 *endinpos = newpos;
1395 *inptr = input + newpos;
1396 Py_UNICODE_COPY(*outptr, repptr, repsize);
1397 *outptr += repsize;
1398 *outpos += repsize;
1399 /* we made it! */
1400 res = 0;
1401
1402 onError:
1403 Py_XDECREF(restuple);
1404 return res;
1405}
1406
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001407/* --- UTF-7 Codec -------------------------------------------------------- */
1408
1409/* see RFC2152 for details */
1410
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry says whether the character can be written directly:
       0 - not special
       1 - special (cannot be directly encoded)
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1429
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded in UTF-7.  encodeO and
   encodeWS select whether RFC2152 Set O characters and whitespace are
   treated as special as well.  c is evaluated more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Written as explicit ASCII range tests
   rather than isalnum() so that the result does not depend on the
   locale and so that any integer value of c (including values outside
   the unsigned char range) is safe.  c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit to its 6-bit value; c must satisfy B64CHAR(c) */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least 6 of the
   `bits` buffered in `ch` remain; updates out and bits in place. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit characters through `out` for as long as at least 16 of
   the `bits` buffered in `ch` remain.  Requires a variable `errmsg`
   and a label `utf7Error` in the expanding scope: a code unit in the
   range 0xDC00..0xDFFF sets errmsg and jumps to utf7Error. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001475 const char *errors)
1476{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001477 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1478}
1479
1480PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1481 Py_ssize_t size,
1482 const char *errors,
1483 Py_ssize_t *consumed)
1484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001486 Py_ssize_t startinpos;
1487 Py_ssize_t endinpos;
1488 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 const char *e;
1490 PyUnicodeObject *unicode;
1491 Py_UNICODE *p;
1492 const char *errmsg = "";
1493 int inShift = 0;
1494 unsigned int bitsleft = 0;
1495 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 int surrogate = 0;
1497 PyObject *errorHandler = NULL;
1498 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499
1500 unicode = _PyUnicode_New(size);
1501 if (!unicode)
1502 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001503 if (size == 0) {
1504 if (consumed)
1505 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001506 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001507 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508
1509 p = unicode->str;
1510 e = s + size;
1511
1512 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001513 Py_UNICODE ch;
1514 restart:
1515 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001516
1517 if (inShift) {
1518 if ((ch == '-') || !B64CHAR(ch)) {
1519 inShift = 0;
1520 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001521
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1523 if (bitsleft >= 6) {
1524 /* The shift sequence has a partial character in it. If
1525 bitsleft < 6 then we could just classify it as padding
1526 but that is not the case here */
1527
1528 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001529 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530 }
1531 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001532 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 here so indicate the potential of a misencoded character. */
1534
1535 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1536 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1537 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001538 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 }
1540
1541 if (ch == '-') {
1542 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001543 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 inShift = 1;
1545 }
1546 } else if (SPECIAL(ch,0,0)) {
1547 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001548 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 } else {
1550 *p++ = ch;
1551 }
1552 } else {
1553 charsleft = (charsleft << 6) | UB64(ch);
1554 bitsleft += 6;
1555 s++;
1556 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1557 }
1558 }
1559 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001560 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 s++;
1562 if (s < e && *s == '-') {
1563 s++;
1564 *p++ = '+';
1565 } else
1566 {
1567 inShift = 1;
1568 bitsleft = 0;
1569 }
1570 }
1571 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001572 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 errmsg = "unexpected special character";
1574 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001575 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 }
1577 else {
1578 *p++ = ch;
1579 s++;
1580 }
1581 continue;
1582 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 outpos = p-PyUnicode_AS_UNICODE(unicode);
1584 endinpos = s-starts;
1585 if (unicode_decode_call_errorhandler(
1586 errors, &errorHandler,
1587 "utf7", errmsg,
1588 starts, size, &startinpos, &endinpos, &exc, &s,
1589 (PyObject **)&unicode, &outpos, &p))
1590 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 }
1592
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001593 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001594 outpos = p-PyUnicode_AS_UNICODE(unicode);
1595 endinpos = size;
1596 if (unicode_decode_call_errorhandler(
1597 errors, &errorHandler,
1598 "utf7", "unterminated shift sequence",
1599 starts, size, &startinpos, &endinpos, &exc, &s,
1600 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (s < e)
1603 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001605 if (consumed) {
1606 if(inShift)
1607 *consumed = startinpos;
1608 else
1609 *consumed = s-starts;
1610 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001612 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 goto onError;
1614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 Py_XDECREF(errorHandler);
1616 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 return (PyObject *)unicode;
1618
1619onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 Py_DECREF(unicode);
1623 return NULL;
1624}
1625
1626
1627PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001628 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629 int encodeSetO,
1630 int encodeWhiteSpace,
1631 const char *errors)
1632{
1633 PyObject *v;
1634 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001637 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 unsigned int bitsleft = 0;
1639 unsigned long charsleft = 0;
1640 char * out;
1641 char * start;
1642
1643 if (size == 0)
1644 return PyString_FromStringAndSize(NULL, 0);
1645
1646 v = PyString_FromStringAndSize(NULL, cbAllocated);
1647 if (v == NULL)
1648 return NULL;
1649
1650 start = out = PyString_AS_STRING(v);
1651 for (;i < size; ++i) {
1652 Py_UNICODE ch = s[i];
1653
1654 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001655 if (ch == '+') {
1656 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 *out++ = '-';
1658 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1659 charsleft = ch;
1660 bitsleft = 16;
1661 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001662 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001664 } else {
1665 *out++ = (char) ch;
1666 }
1667 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1669 *out++ = B64(charsleft << (6-bitsleft));
1670 charsleft = 0;
1671 bitsleft = 0;
1672 /* Characters not in the BASE64 set implicitly unshift the sequence
1673 so no '-' is required, except if the character is itself a '-' */
1674 if (B64CHAR(ch) || ch == '-') {
1675 *out++ = '-';
1676 }
1677 inShift = 0;
1678 *out++ = (char) ch;
1679 } else {
1680 bitsleft += 16;
1681 charsleft = (charsleft << 16) | ch;
1682 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1683
1684 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001685 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 or '-' then the shift sequence will be terminated implicitly and we
1687 don't have to insert a '-'. */
1688
1689 if (bitsleft == 0) {
1690 if (i + 1 < size) {
1691 Py_UNICODE ch2 = s[i+1];
1692
1693 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001694
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 } else if (B64CHAR(ch2) || ch2 == '-') {
1696 *out++ = '-';
1697 inShift = 0;
1698 } else {
1699 inShift = 0;
1700 }
1701
1702 }
1703 else {
1704 *out++ = '-';
1705 inShift = 0;
1706 }
1707 }
Tim Petersced69f82003-09-16 20:30:58 +00001708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001710 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001711 if (bitsleft) {
1712 *out++= B64(charsleft << (6-bitsleft) );
1713 *out++ = '-';
1714 }
1715
Tim Peters5de98422002-04-27 18:44:32 +00001716 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717 return v;
1718}
1719
1720#undef SPECIAL
1721#undef B64
1722#undef B64CHAR
1723#undef UB64
1724#undef ENCODE
1725#undef DECODE
1726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727/* --- UTF-8 Codec -------------------------------------------------------- */
1728
/* Total length in bytes of a UTF-8 sequence, indexed by the value of its
   first byte.  A zero entry marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001752 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 const char *errors)
1754{
Walter Dörwald69652032004-09-07 20:24:22 +00001755 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1756}
1757
1758PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001759 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001760 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t startinpos;
1766 Py_ssize_t endinpos;
1767 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 const char *e;
1769 PyUnicodeObject *unicode;
1770 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 PyObject *errorHandler = NULL;
1773 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
1775 /* Note: size will always be longer than the resulting Unicode
1776 character count */
1777 unicode = _PyUnicode_New(size);
1778 if (!unicode)
1779 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001780 if (size == 0) {
1781 if (consumed)
1782 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785
1786 /* Unpack UTF-8 encoded data */
1787 p = unicode->str;
1788 e = s + size;
1789
1790 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001791 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792
1793 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001794 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 s++;
1796 continue;
1797 }
1798
1799 n = utf8_code_length[ch];
1800
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001802 if (consumed)
1803 break;
1804 else {
1805 errmsg = "unexpected end of data";
1806 startinpos = s-starts;
1807 endinpos = size;
1808 goto utf8Error;
1809 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 switch (n) {
1813
1814 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 startinpos = s-starts;
1823 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825
1826 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 if ((s[1] & 0xc0) != 0x80) {
1828 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 startinpos = s-starts;
1830 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 goto utf8Error;
1832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 errmsg = "illegal encoding";
1838 goto utf8Error;
1839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 break;
1843
1844 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001845 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 (s[2] & 0xc0) != 0x80) {
1847 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 startinpos = s-starts;
1849 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 goto utf8Error;
1851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001853 if (ch < 0x0800) {
1854 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001855 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001856
1857 XXX For wide builds (UCS-4) we should probably try
1858 to recombine the surrogates into a single code
1859 unit.
1860 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 startinpos = s-starts;
1863 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 goto utf8Error;
1865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001867 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001868 break;
1869
1870 case 4:
1871 if ((s[1] & 0xc0) != 0x80 ||
1872 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 (s[3] & 0xc0) != 0x80) {
1874 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 startinpos = s-starts;
1876 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 goto utf8Error;
1878 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001879 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1880 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1881 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001882 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001883 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001884 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001885 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001886 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001887 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 startinpos = s-starts;
1889 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001890 goto utf8Error;
1891 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001892#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001893 *p++ = (Py_UNICODE)ch;
1894#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001895 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001896
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001897 /* translate from 10000..10FFFF to 0..FFFF */
1898 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001899
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 /* high surrogate = top 10 bits added to D800 */
1901 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001902
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001904 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 break;
1907
1908 default:
1909 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001911 startinpos = s-starts;
1912 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001913 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914 }
1915 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001917
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001918 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 outpos = p-PyUnicode_AS_UNICODE(unicode);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "utf8", errmsg,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 (PyObject **)&unicode, &outpos, &p))
1925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 }
Walter Dörwald69652032004-09-07 20:24:22 +00001927 if (consumed)
1928 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001931 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 goto onError;
1933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 Py_XDECREF(errorHandler);
1935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 return (PyObject *)unicode;
1937
1938onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 Py_DECREF(unicode);
1942 return NULL;
1943}
1944
Tim Peters602f7402002-04-27 18:03:26 +00001945/* Allocation strategy: if the string is short, convert into a stack buffer
1946 and allocate exactly as much space needed at the end. Else allocate the
1947 maximum possible needed (4 result bytes per Unicode character), and return
1948 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001949*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001950PyObject *
1951PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001952 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954{
Tim Peters602f7402002-04-27 18:03:26 +00001955#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001956
Martin v. Löwis18e16552006-02-15 17:27:45 +00001957 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001958 PyObject *v; /* result string object */
1959 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001960 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001961 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001962 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001963
Tim Peters602f7402002-04-27 18:03:26 +00001964 assert(s != NULL);
1965 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
Tim Peters602f7402002-04-27 18:03:26 +00001967 if (size <= MAX_SHORT_UNICHARS) {
1968 /* Write into the stack buffer; nallocated can't overflow.
1969 * At the end, we'll allocate exactly as much heap space as it
1970 * turns out we need.
1971 */
1972 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1973 v = NULL; /* will allocate after we're done */
1974 p = stackbuf;
1975 }
1976 else {
1977 /* Overallocate on the heap, and give the excess back at the end. */
1978 nallocated = size * 4;
1979 if (nallocated / 4 != size) /* overflow! */
1980 return PyErr_NoMemory();
1981 v = PyString_FromStringAndSize(NULL, nallocated);
1982 if (v == NULL)
1983 return NULL;
1984 p = PyString_AS_STRING(v);
1985 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001986
Tim Peters602f7402002-04-27 18:03:26 +00001987 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001989
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001990 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001993
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001995 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001996 *p++ = (char)(0xc0 | (ch >> 6));
1997 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001998 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001999 else {
Tim Peters602f7402002-04-27 18:03:26 +00002000 /* Encode UCS2 Unicode ordinals */
2001 if (ch < 0x10000) {
2002 /* Special case: check for high surrogate */
2003 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2004 Py_UCS4 ch2 = s[i];
2005 /* Check for low surrogate and combine the two to
2006 form a UCS4 value */
2007 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002008 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002009 i++;
2010 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Tim Peters602f7402002-04-27 18:03:26 +00002012 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002013 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002015 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2016 *p++ = (char)(0x80 | (ch & 0x3f));
2017 continue;
2018 }
2019encodeUCS4:
2020 /* Encode UCS4 Unicode ordinals */
2021 *p++ = (char)(0xf0 | (ch >> 18));
2022 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2023 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2024 *p++ = (char)(0x80 | (ch & 0x3f));
2025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002027
Tim Peters602f7402002-04-27 18:03:26 +00002028 if (v == NULL) {
2029 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002030 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002031 assert(nneeded <= nallocated);
2032 v = PyString_FromStringAndSize(stackbuf, nneeded);
2033 }
2034 else {
2035 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002036 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002037 assert(nneeded <= nallocated);
2038 _PyString_Resize(&v, nneeded);
2039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002041
Tim Peters602f7402002-04-27 18:03:26 +00002042#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043}
2044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 if (!PyUnicode_Check(unicode)) {
2048 PyErr_BadArgument();
2049 return NULL;
2050 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002051 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2052 PyUnicode_GET_SIZE(unicode),
2053 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054}
2055
Walter Dörwald6e390802007-08-17 16:41:28 +00002056/* --- UTF-32 Codec ------------------------------------------------------- */
2057
2058PyObject *
2059PyUnicode_DecodeUTF32(const char *s,
2060 Py_ssize_t size,
2061 const char *errors,
2062 int *byteorder)
2063{
2064 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2065}
2066
2067PyObject *
2068PyUnicode_DecodeUTF32Stateful(const char *s,
2069 Py_ssize_t size,
2070 const char *errors,
2071 int *byteorder,
2072 Py_ssize_t *consumed)
2073{
2074 const char *starts = s;
2075 Py_ssize_t startinpos;
2076 Py_ssize_t endinpos;
2077 Py_ssize_t outpos;
2078 PyUnicodeObject *unicode;
2079 Py_UNICODE *p;
2080#ifndef Py_UNICODE_WIDE
2081 int i, pairs;
2082#else
2083 const int pairs = 0;
2084#endif
2085 const unsigned char *q, *e;
2086 int bo = 0; /* assume native ordering by default */
2087 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002088 /* Offsets from q for retrieving bytes in the right order. */
2089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2090 int iorder[] = {0, 1, 2, 3};
2091#else
2092 int iorder[] = {3, 2, 1, 0};
2093#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002094 PyObject *errorHandler = NULL;
2095 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002096 /* On narrow builds we split characters outside the BMP into two
2097 codepoints => count how much extra space we need. */
2098#ifndef Py_UNICODE_WIDE
2099 for (i = pairs = 0; i < size/4; i++)
2100 if (((Py_UCS4 *)s)[i] >= 0x10000)
2101 pairs++;
2102#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002103
2104 /* This might be one to much, because of a BOM */
2105 unicode = _PyUnicode_New((size+3)/4+pairs);
2106 if (!unicode)
2107 return NULL;
2108 if (size == 0)
2109 return (PyObject *)unicode;
2110
2111 /* Unpack UTF-32 encoded data */
2112 p = unicode->str;
2113 q = (unsigned char *)s;
2114 e = q + size;
2115
2116 if (byteorder)
2117 bo = *byteorder;
2118
2119 /* Check for BOM marks (U+FEFF) in the input and adjust current
2120 byte order setting accordingly. In native mode, the leading BOM
2121 mark is skipped, in all other modes, it is copied to the output
2122 stream as-is (giving a ZWNBSP character). */
2123 if (bo == 0) {
2124 if (size >= 4) {
2125 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2126 (q[iorder[1]] << 8) | q[iorder[0]];
2127#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2128 if (bom == 0x0000FEFF) {
2129 q += 4;
2130 bo = -1;
2131 }
2132 else if (bom == 0xFFFE0000) {
2133 q += 4;
2134 bo = 1;
2135 }
2136#else
2137 if (bom == 0x0000FEFF) {
2138 q += 4;
2139 bo = 1;
2140 }
2141 else if (bom == 0xFFFE0000) {
2142 q += 4;
2143 bo = -1;
2144 }
2145#endif
2146 }
2147 }
2148
2149 if (bo == -1) {
2150 /* force LE */
2151 iorder[0] = 0;
2152 iorder[1] = 1;
2153 iorder[2] = 2;
2154 iorder[3] = 3;
2155 }
2156 else if (bo == 1) {
2157 /* force BE */
2158 iorder[0] = 3;
2159 iorder[1] = 2;
2160 iorder[2] = 1;
2161 iorder[3] = 0;
2162 }
2163
2164 while (q < e) {
2165 Py_UCS4 ch;
2166 /* remaining bytes at the end? (size should be divisible by 4) */
2167 if (e-q<4) {
2168 if (consumed)
2169 break;
2170 errmsg = "truncated data";
2171 startinpos = ((const char *)q)-starts;
2172 endinpos = ((const char *)e)-starts;
2173 goto utf32Error;
2174 /* The remaining input chars are ignored if the callback
2175 chooses to skip the input */
2176 }
2177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2178 (q[iorder[1]] << 8) | q[iorder[0]];
2179
2180 if (ch >= 0x110000)
2181 {
2182 errmsg = "codepoint not in range(0x110000)";
2183 startinpos = ((const char *)q)-starts;
2184 endinpos = startinpos+4;
2185 goto utf32Error;
2186 }
2187#ifndef Py_UNICODE_WIDE
2188 if (ch >= 0x10000)
2189 {
2190 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2191 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2192 }
2193 else
2194#endif
2195 *p++ = ch;
2196 q += 4;
2197 continue;
2198 utf32Error:
2199 outpos = p-PyUnicode_AS_UNICODE(unicode);
2200 if (unicode_decode_call_errorhandler(
2201 errors, &errorHandler,
2202 "utf32", errmsg,
2203 starts, size, &startinpos, &endinpos, &exc, &s,
2204 (PyObject **)&unicode, &outpos, &p))
2205 goto onError;
2206 }
2207
2208 if (byteorder)
2209 *byteorder = bo;
2210
2211 if (consumed)
2212 *consumed = (const char *)q-starts;
2213
2214 /* Adjust length */
2215 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2216 goto onError;
2217
2218 Py_XDECREF(errorHandler);
2219 Py_XDECREF(exc);
2220 return (PyObject *)unicode;
2221
2222onError:
2223 Py_DECREF(unicode);
2224 Py_XDECREF(errorHandler);
2225 Py_XDECREF(exc);
2226 return NULL;
2227}
2228
2229PyObject *
2230PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2231 Py_ssize_t size,
2232 const char *errors,
2233 int byteorder)
2234{
2235 PyObject *v;
2236 unsigned char *p;
2237#ifndef Py_UNICODE_WIDE
2238 int i, pairs;
2239#else
2240 const int pairs = 0;
2241#endif
2242 /* Offsets from p for storing byte pairs in the right order. */
2243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2244 int iorder[] = {0, 1, 2, 3};
2245#else
2246 int iorder[] = {3, 2, 1, 0};
2247#endif
2248
2249#define STORECHAR(CH) \
2250 do { \
2251 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2252 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2253 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2254 p[iorder[0]] = (CH) & 0xff; \
2255 p += 4; \
2256 } while(0)
2257
2258 /* In narrow builds we can output surrogate pairs as one codepoint,
2259 so we need less space. */
2260#ifndef Py_UNICODE_WIDE
2261 for (i = pairs = 0; i < size-1; i++)
2262 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2263 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2264 pairs++;
2265#endif
2266 v = PyString_FromStringAndSize(NULL,
2267 4 * (size - pairs + (byteorder == 0)));
2268 if (v == NULL)
2269 return NULL;
2270
2271 p = (unsigned char *)PyString_AS_STRING(v);
2272 if (byteorder == 0)
2273 STORECHAR(0xFEFF);
2274 if (size == 0)
2275 return v;
2276
2277 if (byteorder == -1) {
2278 /* force LE */
2279 iorder[0] = 0;
2280 iorder[1] = 1;
2281 iorder[2] = 2;
2282 iorder[3] = 3;
2283 }
2284 else if (byteorder == 1) {
2285 /* force BE */
2286 iorder[0] = 3;
2287 iorder[1] = 2;
2288 iorder[2] = 1;
2289 iorder[3] = 0;
2290 }
2291
2292 while (size-- > 0) {
2293 Py_UCS4 ch = *s++;
2294#ifndef Py_UNICODE_WIDE
2295 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2296 Py_UCS4 ch2 = *s;
2297 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2298 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2299 s++;
2300 size--;
2301 }
2302 }
2303#endif
2304 STORECHAR(ch);
2305 }
2306 return v;
2307#undef STORECHAR
2308}
2309
2310PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2311{
2312 if (!PyUnicode_Check(unicode)) {
2313 PyErr_BadArgument();
2314 return NULL;
2315 }
2316 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2317 PyUnicode_GET_SIZE(unicode),
2318 NULL,
2319 0);
2320}
2321
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322/* --- UTF-16 Codec ------------------------------------------------------- */
2323
Tim Peters772747b2001-08-09 22:21:55 +00002324PyObject *
2325PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002326 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002327 const char *errors,
2328 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329{
Walter Dörwald69652032004-09-07 20:24:22 +00002330 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2331}
2332
2333PyObject *
2334PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002335 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002336 const char *errors,
2337 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002338 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002340 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002341 Py_ssize_t startinpos;
2342 Py_ssize_t endinpos;
2343 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 PyUnicodeObject *unicode;
2345 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002346 const unsigned char *q, *e;
2347 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002348 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002349 /* Offsets from q for retrieving byte pairs in the right order. */
2350#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2351 int ihi = 1, ilo = 0;
2352#else
2353 int ihi = 0, ilo = 1;
2354#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 PyObject *errorHandler = NULL;
2356 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357
2358 /* Note: size will always be longer than the resulting Unicode
2359 character count */
2360 unicode = _PyUnicode_New(size);
2361 if (!unicode)
2362 return NULL;
2363 if (size == 0)
2364 return (PyObject *)unicode;
2365
2366 /* Unpack UTF-16 encoded data */
2367 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002368 q = (unsigned char *)s;
2369 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370
2371 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002372 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002374 /* Check for BOM marks (U+FEFF) in the input and adjust current
2375 byte order setting accordingly. In native mode, the leading BOM
2376 mark is skipped, in all other modes, it is copied to the output
2377 stream as-is (giving a ZWNBSP character). */
2378 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002379 if (size >= 2) {
2380 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002381#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002382 if (bom == 0xFEFF) {
2383 q += 2;
2384 bo = -1;
2385 }
2386 else if (bom == 0xFFFE) {
2387 q += 2;
2388 bo = 1;
2389 }
Tim Petersced69f82003-09-16 20:30:58 +00002390#else
Walter Dörwald69652032004-09-07 20:24:22 +00002391 if (bom == 0xFEFF) {
2392 q += 2;
2393 bo = 1;
2394 }
2395 else if (bom == 0xFFFE) {
2396 q += 2;
2397 bo = -1;
2398 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002399#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002400 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
Tim Peters772747b2001-08-09 22:21:55 +00002403 if (bo == -1) {
2404 /* force LE */
2405 ihi = 1;
2406 ilo = 0;
2407 }
2408 else if (bo == 1) {
2409 /* force BE */
2410 ihi = 0;
2411 ilo = 1;
2412 }
2413
2414 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002416 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002418 if (consumed)
2419 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 errmsg = "truncated data";
2421 startinpos = ((const char *)q)-starts;
2422 endinpos = ((const char *)e)-starts;
2423 goto utf16Error;
2424 /* The remaining input chars are ignored if the callback
2425 chooses to skip the input */
2426 }
2427 ch = (q[ihi] << 8) | q[ilo];
2428
Tim Peters772747b2001-08-09 22:21:55 +00002429 q += 2;
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 if (ch < 0xD800 || ch > 0xDFFF) {
2432 *p++ = ch;
2433 continue;
2434 }
2435
2436 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 if (q >= e) {
2438 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002439 startinpos = (((const char *)q)-2)-starts;
2440 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002441 goto utf16Error;
2442 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002443 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002444 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2445 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002446 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002447#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002448 *p++ = ch;
2449 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002450#else
2451 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002452#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002453 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454 }
2455 else {
2456 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 startinpos = (((const char *)q)-4)-starts;
2458 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002459 goto utf16Error;
2460 }
2461
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002463 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 startinpos = (((const char *)q)-2)-starts;
2465 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002466 /* Fall through to report the error */
2467
2468 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469 outpos = p-PyUnicode_AS_UNICODE(unicode);
2470 if (unicode_decode_call_errorhandler(
2471 errors, &errorHandler,
2472 "utf16", errmsg,
2473 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2474 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002475 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
2477
2478 if (byteorder)
2479 *byteorder = bo;
2480
Walter Dörwald69652032004-09-07 20:24:22 +00002481 if (consumed)
2482 *consumed = (const char *)q-starts;
2483
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002485 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 goto onError;
2487
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 Py_XDECREF(errorHandler);
2489 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 return (PyObject *)unicode;
2491
2492onError:
2493 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return NULL;
2497}
2498
Tim Peters772747b2001-08-09 22:21:55 +00002499PyObject *
2500PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002502 const char *errors,
2503 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504{
2505 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002506 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002507#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002508 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002509#else
2510 const int pairs = 0;
2511#endif
Tim Peters772747b2001-08-09 22:21:55 +00002512 /* Offsets from p for storing byte pairs in the right order. */
2513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2514 int ihi = 1, ilo = 0;
2515#else
2516 int ihi = 0, ilo = 1;
2517#endif
2518
2519#define STORECHAR(CH) \
2520 do { \
2521 p[ihi] = ((CH) >> 8) & 0xff; \
2522 p[ilo] = (CH) & 0xff; \
2523 p += 2; \
2524 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002526#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002527 for (i = pairs = 0; i < size; i++)
2528 if (s[i] >= 0x10000)
2529 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002530#endif
Tim Petersced69f82003-09-16 20:30:58 +00002531 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002532 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 if (v == NULL)
2534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535
Tim Peters772747b2001-08-09 22:21:55 +00002536 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002538 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002539 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002540 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002541
2542 if (byteorder == -1) {
2543 /* force LE */
2544 ihi = 1;
2545 ilo = 0;
2546 }
2547 else if (byteorder == 1) {
2548 /* force BE */
2549 ihi = 0;
2550 ilo = 1;
2551 }
2552
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002553 while (size-- > 0) {
2554 Py_UNICODE ch = *s++;
2555 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002556#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002557 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002558 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2559 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002561#endif
Tim Peters772747b2001-08-09 22:21:55 +00002562 STORECHAR(ch);
2563 if (ch2)
2564 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002567#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568}
2569
2570PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2571{
2572 if (!PyUnicode_Check(unicode)) {
2573 PyErr_BadArgument();
2574 return NULL;
2575 }
2576 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2577 PyUnicode_GET_SIZE(unicode),
2578 NULL,
2579 0);
2580}
2581
2582/* --- Unicode Escape Codec ----------------------------------------------- */
2583
Fredrik Lundh06d12682001-01-24 07:59:11 +00002584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002585
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002587 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 const char *errors)
2589{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002591 Py_ssize_t startinpos;
2592 Py_ssize_t endinpos;
2593 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002598 char* message;
2599 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 PyObject *errorHandler = NULL;
2601 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 /* Escaped strings will always be longer than the resulting
2604 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 length after conversion to the true value.
2606 (but if the error callback returns a long replacement string
2607 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 v = _PyUnicode_New(size);
2609 if (v == NULL)
2610 goto onError;
2611 if (size == 0)
2612 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 while (s < end) {
2618 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002619 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621
2622 /* Non-escape characters are interpreted as Unicode ordinals */
2623 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002624 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 continue;
2626 }
2627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 /* \ - Escapes */
2630 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002631 c = *s++;
2632 if (s > end)
2633 c = '\0'; /* Invalid after \ */
2634 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635
2636 /* \x escapes */
2637 case '\n': break;
2638 case '\\': *p++ = '\\'; break;
2639 case '\'': *p++ = '\''; break;
2640 case '\"': *p++ = '\"'; break;
2641 case 'b': *p++ = '\b'; break;
2642 case 'f': *p++ = '\014'; break; /* FF */
2643 case 't': *p++ = '\t'; break;
2644 case 'n': *p++ = '\n'; break;
2645 case 'r': *p++ = '\r'; break;
2646 case 'v': *p++ = '\013'; break; /* VT */
2647 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2648
2649 /* \OOO (octal) escapes */
2650 case '0': case '1': case '2': case '3':
2651 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002652 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002653 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002654 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002655 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002656 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002658 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 break;
2660
Fredrik Lundhccc74732001-02-18 22:13:49 +00002661 /* hex escapes */
2662 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002664 digits = 2;
2665 message = "truncated \\xXX escape";
2666 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667
Fredrik Lundhccc74732001-02-18 22:13:49 +00002668 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 digits = 4;
2671 message = "truncated \\uXXXX escape";
2672 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002675 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 digits = 8;
2677 message = "truncated \\UXXXXXXXX escape";
2678 hexescape:
2679 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 outpos = p-PyUnicode_AS_UNICODE(v);
2681 if (s+digits>end) {
2682 endinpos = size;
2683 if (unicode_decode_call_errorhandler(
2684 errors, &errorHandler,
2685 "unicodeescape", "end of string in escape sequence",
2686 starts, size, &startinpos, &endinpos, &exc, &s,
2687 (PyObject **)&v, &outpos, &p))
2688 goto onError;
2689 goto nextByte;
2690 }
2691 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002693 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 endinpos = (s+i+1)-starts;
2695 if (unicode_decode_call_errorhandler(
2696 errors, &errorHandler,
2697 "unicodeescape", message,
2698 starts, size, &startinpos, &endinpos, &exc, &s,
2699 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002700 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002702 }
2703 chr = (chr<<4) & ~0xF;
2704 if (c >= '0' && c <= '9')
2705 chr += c - '0';
2706 else if (c >= 'a' && c <= 'f')
2707 chr += 10 + c - 'a';
2708 else
2709 chr += 10 + c - 'A';
2710 }
2711 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002712 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 /* _decoding_error will have already written into the
2714 target buffer. */
2715 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002716 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002717 /* when we get here, chr is a 32-bit unicode character */
2718 if (chr <= 0xffff)
2719 /* UCS-2 character */
2720 *p++ = (Py_UNICODE) chr;
2721 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002722 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002723 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002725 *p++ = chr;
2726#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002727 chr -= 0x10000L;
2728 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002729 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002730#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002731 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 endinpos = s-starts;
2733 outpos = p-PyUnicode_AS_UNICODE(v);
2734 if (unicode_decode_call_errorhandler(
2735 errors, &errorHandler,
2736 "unicodeescape", "illegal Unicode character",
2737 starts, size, &startinpos, &endinpos, &exc, &s,
2738 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 goto onError;
2740 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 break;
2742
2743 /* \N{name} */
2744 case 'N':
2745 message = "malformed \\N character escape";
2746 if (ucnhash_CAPI == NULL) {
2747 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002749 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002750 if (m == NULL)
2751 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002752 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002754 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002755 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002756 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002757 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758 if (ucnhash_CAPI == NULL)
2759 goto ucnhashError;
2760 }
2761 if (*s == '{') {
2762 const char *start = s+1;
2763 /* look for the closing brace */
2764 while (*s != '}' && s < end)
2765 s++;
2766 if (s > start && s < end && *s == '}') {
2767 /* found a name. look it up in the unicode database */
2768 message = "unknown Unicode character name";
2769 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002770 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 goto store;
2772 }
2773 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 endinpos = s-starts;
2775 outpos = p-PyUnicode_AS_UNICODE(v);
2776 if (unicode_decode_call_errorhandler(
2777 errors, &errorHandler,
2778 "unicodeescape", message,
2779 starts, size, &startinpos, &endinpos, &exc, &s,
2780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002782 break;
2783
2784 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002785 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 message = "\\ at end of string";
2787 s--;
2788 endinpos = s-starts;
2789 outpos = p-PyUnicode_AS_UNICODE(v);
2790 if (unicode_decode_call_errorhandler(
2791 errors, &errorHandler,
2792 "unicodeescape", message,
2793 starts, size, &startinpos, &endinpos, &exc, &s,
2794 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002795 goto onError;
2796 }
2797 else {
2798 *p++ = '\\';
2799 *p++ = (unsigned char)s[-1];
2800 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002801 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 nextByte:
2804 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002811
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002813 PyErr_SetString(
2814 PyExc_UnicodeError,
2815 "\\N escapes not supported (can't load unicodedata module)"
2816 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002817 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 Py_XDECREF(errorHandler);
2819 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002820 return NULL;
2821
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 Py_XDECREF(errorHandler);
2825 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 return NULL;
2827}
2828
2829/* Return a Unicode-Escape string version of the Unicode object.
2830
2831 If quotes is true, the string is enclosed in u"" or u'' quotes as
2832 appropriate.
2833
2834*/
2835
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002836Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002837 Py_ssize_t size,
2838 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002839{
2840 /* like wcschr, but doesn't stop at NULL characters */
2841
2842 while (size-- > 0) {
2843 if (*s == ch)
2844 return s;
2845 s++;
2846 }
2847
2848 return NULL;
2849}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851static
2852PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 int quotes)
2855{
2856 PyObject *repr;
2857 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002859 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860
Neal Norwitz17753ec2006-08-21 22:21:19 +00002861 /* XXX(nnorwitz): rather than over-allocating, it would be
2862 better to choose a different scheme. Perhaps scan the
2863 first N-chars of the string and allocate based on that size.
2864 */
2865 /* Initial allocation is based on the longest-possible unichr
2866 escape.
2867
2868 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2869 unichr, so in this case it's the longest unichr escape. In
2870 narrow (UTF-16) builds this is five chars per source unichr
2871 since there are two unichrs in the surrogate pair, so in narrow
2872 (UTF-16) builds it's not the longest unichr escape.
2873
2874 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2875 so in the narrow (UTF-16) build case it's the longest unichr
2876 escape.
2877 */
2878
2879 repr = PyString_FromStringAndSize(NULL,
2880 2
2881#ifdef Py_UNICODE_WIDE
2882 + 10*size
2883#else
2884 + 6*size
2885#endif
2886 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 if (repr == NULL)
2888 return NULL;
2889
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002890 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
2892 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002894 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 !findchar(s, size, '"')) ? '"' : '\'';
2896 }
2897 while (size-- > 0) {
2898 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002899
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002900 /* Escape quotes and backslashes */
2901 if ((quotes &&
2902 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 *p++ = '\\';
2904 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002905 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002906 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002907
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002908#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002909 /* Map 21-bit characters to '\U00xxxxxx' */
2910 else if (ch >= 0x10000) {
2911 *p++ = '\\';
2912 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002913 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2914 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2915 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2916 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2917 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2918 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2919 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002920 *p++ = hexdigit[ch & 0x0000000F];
2921 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002922 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002923#else
2924 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002925 else if (ch >= 0xD800 && ch < 0xDC00) {
2926 Py_UNICODE ch2;
2927 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002928
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 ch2 = *s++;
2930 size--;
2931 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2932 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2933 *p++ = '\\';
2934 *p++ = 'U';
2935 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2936 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2937 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2938 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2939 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2940 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2941 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2942 *p++ = hexdigit[ucs & 0x0000000F];
2943 continue;
2944 }
2945 /* Fall through: isolated surrogates are copied as-is */
2946 s--;
2947 size++;
2948 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002949#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002952 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 *p++ = '\\';
2954 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002955 *p++ = hexdigit[(ch >> 12) & 0x000F];
2956 *p++ = hexdigit[(ch >> 8) & 0x000F];
2957 *p++ = hexdigit[(ch >> 4) & 0x000F];
2958 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002961 /* Map special whitespace to '\t', \n', '\r' */
2962 else if (ch == '\t') {
2963 *p++ = '\\';
2964 *p++ = 't';
2965 }
2966 else if (ch == '\n') {
2967 *p++ = '\\';
2968 *p++ = 'n';
2969 }
2970 else if (ch == '\r') {
2971 *p++ = '\\';
2972 *p++ = 'r';
2973 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002975 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002976 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002978 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002979 *p++ = hexdigit[(ch >> 4) & 0x000F];
2980 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002981 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002982
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 /* Copy everything else as-is */
2984 else
2985 *p++ = (char) ch;
2986 }
2987 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989
2990 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002991 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 return repr;
2993}
2994
2995PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002996 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997{
2998 return unicodeescape_string(s, size, 0);
2999}
3000
3001PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3002{
3003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_BadArgument();
3005 return NULL;
3006 }
3007 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3008 PyUnicode_GET_SIZE(unicode));
3009}
3010
3011/* --- Raw Unicode Escape Codec ------------------------------------------- */
3012
3013PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 const char *errors)
3016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t startinpos;
3019 Py_ssize_t endinpos;
3020 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 const char *end;
3024 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 PyObject *errorHandler = NULL;
3026 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 /* Escaped strings will always be longer than the resulting
3029 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 length after conversion to the true value. (But decoding error
3031 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 v = _PyUnicode_New(size);
3033 if (v == NULL)
3034 goto onError;
3035 if (size == 0)
3036 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 end = s + size;
3039 while (s < end) {
3040 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003041 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003043 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
3045 /* Non-escape characters are interpreted as Unicode ordinals */
3046 if (*s != '\\') {
3047 *p++ = (unsigned char)*s++;
3048 continue;
3049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
3052 /* \u-escapes are only interpreted iff the number of leading
3053 backslashes if odd */
3054 bs = s;
3055 for (;s < end;) {
3056 if (*s != '\\')
3057 break;
3058 *p++ = (unsigned char)*s++;
3059 }
3060 if (((s - bs) & 1) == 0 ||
3061 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 continue;
3064 }
3065 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003066 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 s++;
3068
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003069 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003071 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 endinpos = s-starts;
3075 if (unicode_decode_call_errorhandler(
3076 errors, &errorHandler,
3077 "rawunicodeescape", "truncated \\uXXXX",
3078 starts, size, &startinpos, &endinpos, &exc, &s,
3079 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 }
3083 x = (x<<4) & ~0xF;
3084 if (c >= '0' && c <= '9')
3085 x += c - '0';
3086 else if (c >= 'a' && c <= 'f')
3087 x += 10 + c - 'a';
3088 else
3089 x += 10 + c - 'A';
3090 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003091#ifndef Py_UNICODE_WIDE
3092 if (x > 0x10000) {
3093 if (unicode_decode_call_errorhandler(
3094 errors, &errorHandler,
3095 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3096 starts, size, &startinpos, &endinpos, &exc, &s,
3097 (PyObject **)&v, &outpos, &p))
3098 goto onError;
3099 }
3100#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 *p++ = x;
3102 nextByte:
3103 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003105 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 Py_XDECREF(errorHandler);
3108 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003110
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 onError:
3112 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 Py_XDECREF(errorHandler);
3114 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115 return NULL;
3116}
3117
3118PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *repr;
3122 char *p;
3123 char *q;
3124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003125 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003127#ifdef Py_UNICODE_WIDE
3128 repr = PyString_FromStringAndSize(NULL, 10 * size);
3129#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003131#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 if (repr == NULL)
3133 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003134 if (size == 0)
3135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136
3137 p = q = PyString_AS_STRING(repr);
3138 while (size-- > 0) {
3139 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003140#ifdef Py_UNICODE_WIDE
3141 /* Map 32-bit characters to '\Uxxxxxxxx' */
3142 if (ch >= 0x10000) {
3143 *p++ = '\\';
3144 *p++ = 'U';
3145 *p++ = hexdigit[(ch >> 28) & 0xf];
3146 *p++ = hexdigit[(ch >> 24) & 0xf];
3147 *p++ = hexdigit[(ch >> 20) & 0xf];
3148 *p++ = hexdigit[(ch >> 16) & 0xf];
3149 *p++ = hexdigit[(ch >> 12) & 0xf];
3150 *p++ = hexdigit[(ch >> 8) & 0xf];
3151 *p++ = hexdigit[(ch >> 4) & 0xf];
3152 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003153 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154 else
3155#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 /* Map 16-bit characters to '\uxxxx' */
3157 if (ch >= 256) {
3158 *p++ = '\\';
3159 *p++ = 'u';
3160 *p++ = hexdigit[(ch >> 12) & 0xf];
3161 *p++ = hexdigit[(ch >> 8) & 0xf];
3162 *p++ = hexdigit[(ch >> 4) & 0xf];
3163 *p++ = hexdigit[ch & 15];
3164 }
3165 /* Copy everything else as-is */
3166 else
3167 *p++ = (char) ch;
3168 }
3169 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003170 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 return repr;
3172}
3173
3174PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3175{
3176 if (!PyUnicode_Check(unicode)) {
3177 PyErr_BadArgument();
3178 return NULL;
3179 }
3180 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3181 PyUnicode_GET_SIZE(unicode));
3182}
3183
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003184/* --- Unicode Internal Codec ------------------------------------------- */
3185
3186PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003187 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003188 const char *errors)
3189{
3190 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003191 Py_ssize_t startinpos;
3192 Py_ssize_t endinpos;
3193 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003194 PyUnicodeObject *v;
3195 Py_UNICODE *p;
3196 const char *end;
3197 const char *reason;
3198 PyObject *errorHandler = NULL;
3199 PyObject *exc = NULL;
3200
Neal Norwitzd43069c2006-01-08 01:12:10 +00003201#ifdef Py_UNICODE_WIDE
3202 Py_UNICODE unimax = PyUnicode_GetMax();
3203#endif
3204
Armin Rigo7ccbca92006-10-04 12:17:45 +00003205 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003206 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3207 if (v == NULL)
3208 goto onError;
3209 if (PyUnicode_GetSize((PyObject *)v) == 0)
3210 return (PyObject *)v;
3211 p = PyUnicode_AS_UNICODE(v);
3212 end = s + size;
3213
3214 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003215 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003216 /* We have to sanity check the raw data, otherwise doom looms for
3217 some malformed UCS-4 data. */
3218 if (
3219 #ifdef Py_UNICODE_WIDE
3220 *p > unimax || *p < 0 ||
3221 #endif
3222 end-s < Py_UNICODE_SIZE
3223 )
3224 {
3225 startinpos = s - starts;
3226 if (end-s < Py_UNICODE_SIZE) {
3227 endinpos = end-starts;
3228 reason = "truncated input";
3229 }
3230 else {
3231 endinpos = s - starts + Py_UNICODE_SIZE;
3232 reason = "illegal code point (> 0x10FFFF)";
3233 }
3234 outpos = p - PyUnicode_AS_UNICODE(v);
3235 if (unicode_decode_call_errorhandler(
3236 errors, &errorHandler,
3237 "unicode_internal", reason,
3238 starts, size, &startinpos, &endinpos, &exc, &s,
3239 (PyObject **)&v, &outpos, &p)) {
3240 goto onError;
3241 }
3242 }
3243 else {
3244 p++;
3245 s += Py_UNICODE_SIZE;
3246 }
3247 }
3248
Martin v. Löwis412fb672006-04-13 06:34:32 +00003249 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003250 goto onError;
3251 Py_XDECREF(errorHandler);
3252 Py_XDECREF(exc);
3253 return (PyObject *)v;
3254
3255 onError:
3256 Py_XDECREF(v);
3257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
3259 return NULL;
3260}
3261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262/* --- Latin-1 Codec ------------------------------------------------------ */
3263
3264PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 const char *errors)
3267{
3268 PyUnicodeObject *v;
3269 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003272 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003273 Py_UNICODE r = *(unsigned char*)s;
3274 return PyUnicode_FromUnicode(&r, 1);
3275 }
3276
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 v = _PyUnicode_New(size);
3278 if (v == NULL)
3279 goto onError;
3280 if (size == 0)
3281 return (PyObject *)v;
3282 p = PyUnicode_AS_UNICODE(v);
3283 while (size-- > 0)
3284 *p++ = (unsigned char)*s++;
3285 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003286
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 onError:
3288 Py_XDECREF(v);
3289 return NULL;
3290}
3291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292/* create or adjust a UnicodeEncodeError */
3293static void make_encode_exception(PyObject **exceptionObject,
3294 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003295 const Py_UNICODE *unicode, Py_ssize_t size,
3296 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 if (*exceptionObject == NULL) {
3300 *exceptionObject = PyUnicodeEncodeError_Create(
3301 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
3303 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3305 goto onError;
3306 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3307 goto onError;
3308 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3309 goto onError;
3310 return;
3311 onError:
3312 Py_DECREF(*exceptionObject);
3313 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 }
3315}
3316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317/* raises a UnicodeEncodeError */
3318static void raise_encode_exception(PyObject **exceptionObject,
3319 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003320 const Py_UNICODE *unicode, Py_ssize_t size,
3321 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 const char *reason)
3323{
3324 make_encode_exception(exceptionObject,
3325 encoding, unicode, size, startpos, endpos, reason);
3326 if (*exceptionObject != NULL)
3327 PyCodec_StrictErrors(*exceptionObject);
3328}
3329
3330/* error handling callback helper:
3331 build arguments, call the callback and check the arguments,
3332 put the result into newpos and return the replacement string, which
3333 has to be freed by the caller */
3334static PyObject *unicode_encode_call_errorhandler(const char *errors,
3335 PyObject **errorHandler,
3336 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3338 Py_ssize_t startpos, Py_ssize_t endpos,
3339 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003341 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342
3343 PyObject *restuple;
3344 PyObject *resunicode;
3345
3346 if (*errorHandler == NULL) {
3347 *errorHandler = PyCodec_LookupError(errors);
3348 if (*errorHandler == NULL)
3349 return NULL;
3350 }
3351
3352 make_encode_exception(exceptionObject,
3353 encoding, unicode, size, startpos, endpos, reason);
3354 if (*exceptionObject == NULL)
3355 return NULL;
3356
3357 restuple = PyObject_CallFunctionObjArgs(
3358 *errorHandler, *exceptionObject, NULL);
3359 if (restuple == NULL)
3360 return NULL;
3361 if (!PyTuple_Check(restuple)) {
3362 PyErr_Format(PyExc_TypeError, &argparse[4]);
3363 Py_DECREF(restuple);
3364 return NULL;
3365 }
3366 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3367 &resunicode, newpos)) {
3368 Py_DECREF(restuple);
3369 return NULL;
3370 }
3371 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003372 *newpos = size+*newpos;
3373 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003375 Py_DECREF(restuple);
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_INCREF(resunicode);
3379 Py_DECREF(restuple);
3380 return resunicode;
3381}
3382
3383static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003384 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 const char *errors,
3386 int limit)
3387{
3388 /* output object */
3389 PyObject *res;
3390 /* pointers to the beginning and end+1 of input */
3391 const Py_UNICODE *startp = p;
3392 const Py_UNICODE *endp = p + size;
3393 /* pointer to the beginning of the unencodable characters */
3394 /* const Py_UNICODE *badp = NULL; */
3395 /* pointer into the output */
3396 char *str;
3397 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 Py_ssize_t respos = 0;
3399 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003400 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3401 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 PyObject *errorHandler = NULL;
3403 PyObject *exc = NULL;
3404 /* the following variable is used for caching string comparisons
3405 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3406 int known_errorHandler = -1;
3407
3408 /* allocate enough for a simple encoding without
3409 replacements, if we need more, we'll resize */
3410 res = PyString_FromStringAndSize(NULL, size);
3411 if (res == NULL)
3412 goto onError;
3413 if (size == 0)
3414 return res;
3415 str = PyString_AS_STRING(res);
3416 ressize = size;
3417
3418 while (p<endp) {
3419 Py_UNICODE c = *p;
3420
3421 /* can we encode this? */
3422 if (c<limit) {
3423 /* no overflow check, because we know that the space is enough */
3424 *str++ = (char)c;
3425 ++p;
3426 }
3427 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t unicodepos = p-startp;
3429 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003431 Py_ssize_t repsize;
3432 Py_ssize_t newpos;
3433 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 Py_UNICODE *uni2;
3435 /* startpos for collecting unencodable chars */
3436 const Py_UNICODE *collstart = p;
3437 const Py_UNICODE *collend = p;
3438 /* find all unecodable characters */
3439 while ((collend < endp) && ((*collend)>=limit))
3440 ++collend;
3441 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3442 if (known_errorHandler==-1) {
3443 if ((errors==NULL) || (!strcmp(errors, "strict")))
3444 known_errorHandler = 1;
3445 else if (!strcmp(errors, "replace"))
3446 known_errorHandler = 2;
3447 else if (!strcmp(errors, "ignore"))
3448 known_errorHandler = 3;
3449 else if (!strcmp(errors, "xmlcharrefreplace"))
3450 known_errorHandler = 4;
3451 else
3452 known_errorHandler = 0;
3453 }
3454 switch (known_errorHandler) {
3455 case 1: /* strict */
3456 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3457 goto onError;
3458 case 2: /* replace */
3459 while (collstart++<collend)
3460 *str++ = '?'; /* fall through */
3461 case 3: /* ignore */
3462 p = collend;
3463 break;
3464 case 4: /* xmlcharrefreplace */
3465 respos = str-PyString_AS_STRING(res);
3466 /* determine replacement size (temporarily (mis)uses p) */
3467 for (p = collstart, repsize = 0; p < collend; ++p) {
3468 if (*p<10)
3469 repsize += 2+1+1;
3470 else if (*p<100)
3471 repsize += 2+2+1;
3472 else if (*p<1000)
3473 repsize += 2+3+1;
3474 else if (*p<10000)
3475 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003476#ifndef Py_UNICODE_WIDE
3477 else
3478 repsize += 2+5+1;
3479#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 else if (*p<100000)
3481 repsize += 2+5+1;
3482 else if (*p<1000000)
3483 repsize += 2+6+1;
3484 else
3485 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 }
3488 requiredsize = respos+repsize+(endp-collend);
3489 if (requiredsize > ressize) {
3490 if (requiredsize<2*ressize)
3491 requiredsize = 2*ressize;
3492 if (_PyString_Resize(&res, requiredsize))
3493 goto onError;
3494 str = PyString_AS_STRING(res) + respos;
3495 ressize = requiredsize;
3496 }
3497 /* generate replacement (temporarily (mis)uses p) */
3498 for (p = collstart; p < collend; ++p) {
3499 str += sprintf(str, "&#%d;", (int)*p);
3500 }
3501 p = collend;
3502 break;
3503 default:
3504 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3505 encoding, reason, startp, size, &exc,
3506 collstart-startp, collend-startp, &newpos);
3507 if (repunicode == NULL)
3508 goto onError;
3509 /* need more space? (at least enough for what we
3510 have+the replacement+the rest of the string, so
3511 we won't have to check space for encodable characters) */
3512 respos = str-PyString_AS_STRING(res);
3513 repsize = PyUnicode_GET_SIZE(repunicode);
3514 requiredsize = respos+repsize+(endp-collend);
3515 if (requiredsize > ressize) {
3516 if (requiredsize<2*ressize)
3517 requiredsize = 2*ressize;
3518 if (_PyString_Resize(&res, requiredsize)) {
3519 Py_DECREF(repunicode);
3520 goto onError;
3521 }
3522 str = PyString_AS_STRING(res) + respos;
3523 ressize = requiredsize;
3524 }
3525 /* check if there is anything unencodable in the replacement
3526 and copy it to the output */
3527 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3528 c = *uni2;
3529 if (c >= limit) {
3530 raise_encode_exception(&exc, encoding, startp, size,
3531 unicodepos, unicodepos+1, reason);
3532 Py_DECREF(repunicode);
3533 goto onError;
3534 }
3535 *str = (char)c;
3536 }
3537 p = startp + newpos;
3538 Py_DECREF(repunicode);
3539 }
3540 }
3541 }
3542 /* Resize if we allocated to much */
3543 respos = str-PyString_AS_STRING(res);
3544 if (respos<ressize)
3545 /* If this falls res will be NULL */
3546 _PyString_Resize(&res, respos);
3547 Py_XDECREF(errorHandler);
3548 Py_XDECREF(exc);
3549 return res;
3550
3551 onError:
3552 Py_XDECREF(res);
3553 Py_XDECREF(errorHandler);
3554 Py_XDECREF(exc);
3555 return NULL;
3556}
3557
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003559 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 const char *errors)
3561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563}
3564
3565PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3566{
3567 if (!PyUnicode_Check(unicode)) {
3568 PyErr_BadArgument();
3569 return NULL;
3570 }
3571 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3572 PyUnicode_GET_SIZE(unicode),
3573 NULL);
3574}
3575
3576/* --- 7-bit ASCII Codec -------------------------------------------------- */
3577
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 const char *errors)
3581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 PyUnicodeObject *v;
3584 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t startinpos;
3586 Py_ssize_t endinpos;
3587 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 const char *e;
3589 PyObject *errorHandler = NULL;
3590 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003591
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003593 if (size == 1 && *(unsigned char*)s < 128) {
3594 Py_UNICODE r = *(unsigned char*)s;
3595 return PyUnicode_FromUnicode(&r, 1);
3596 }
Tim Petersced69f82003-09-16 20:30:58 +00003597
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 v = _PyUnicode_New(size);
3599 if (v == NULL)
3600 goto onError;
3601 if (size == 0)
3602 return (PyObject *)v;
3603 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 e = s + size;
3605 while (s < e) {
3606 register unsigned char c = (unsigned char)*s;
3607 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 ++s;
3610 }
3611 else {
3612 startinpos = s-starts;
3613 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003614 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 if (unicode_decode_call_errorhandler(
3616 errors, &errorHandler,
3617 "ascii", "ordinal not in range(128)",
3618 starts, size, &startinpos, &endinpos, &exc, &s,
3619 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003623 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003624 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003625 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 Py_XDECREF(errorHandler);
3627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 onError:
3631 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 Py_XDECREF(errorHandler);
3633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 return NULL;
3635}
3636
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003638 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 const char *errors)
3640{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642}
3643
3644PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3645{
3646 if (!PyUnicode_Check(unicode)) {
3647 PyErr_BadArgument();
3648 return NULL;
3649 }
3650 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3651 PyUnicode_GET_SIZE(unicode),
3652 NULL);
3653}
3654
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003655#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003656
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003657/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003658
Martin v. Löwisd8251432006-06-14 05:21:04 +00003659#if SIZEOF_INT < SIZEOF_SSIZE_T
3660#define NEED_RETRY
3661#endif
3662
3663/* XXX This code is limited to "true" double-byte encodings, as
3664 a) it assumes an incomplete character consists of a single byte, and
3665 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3666 encodings, see IsDBCSLeadByteEx documentation. */
3667
/* Return 1 if the byte at s[offset] is a DBCS lead byte that really starts
   a character, 0 otherwise.  A byte that looks like a lead byte only
   counts when CharPrev() cannot step back, when the previous character
   does not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3678
3679/*
3680 * Decode MBCS string into unicode object. If 'final' is set, converts
3681 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3682 */
3683static int decode_mbcs(PyUnicodeObject **v,
3684 const char *s, /* MBCS string */
3685 int size, /* sizeof MBCS string */
3686 int final)
3687{
3688 Py_UNICODE *p;
3689 Py_ssize_t n = 0;
3690 int usize = 0;
3691
3692 assert(size >= 0);
3693
3694 /* Skip trailing lead-byte unless 'final' is set */
3695 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3696 --size;
3697
3698 /* First get the size of the result */
3699 if (size > 0) {
3700 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3701 if (usize == 0) {
3702 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3703 return -1;
3704 }
3705 }
3706
3707 if (*v == NULL) {
3708 /* Create unicode object */
3709 *v = _PyUnicode_New(usize);
3710 if (*v == NULL)
3711 return -1;
3712 }
3713 else {
3714 /* Extend unicode object */
3715 n = PyUnicode_GET_SIZE(*v);
3716 if (_PyUnicode_Resize(v, n + usize) < 0)
3717 return -1;
3718 }
3719
3720 /* Do the conversion */
3721 if (size > 0) {
3722 p = PyUnicode_AS_UNICODE(*v) + n;
3723 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3724 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3725 return -1;
3726 }
3727 }
3728
3729 return size;
3730}
3731
3732PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3733 Py_ssize_t size,
3734 const char *errors,
3735 Py_ssize_t *consumed)
3736{
3737 PyUnicodeObject *v = NULL;
3738 int done;
3739
3740 if (consumed)
3741 *consumed = 0;
3742
3743#ifdef NEED_RETRY
3744 retry:
3745 if (size > INT_MAX)
3746 done = decode_mbcs(&v, s, INT_MAX, 0);
3747 else
3748#endif
3749 done = decode_mbcs(&v, s, (int)size, !consumed);
3750
3751 if (done < 0) {
3752 Py_XDECREF(v);
3753 return NULL;
3754 }
3755
3756 if (consumed)
3757 *consumed += done;
3758
3759#ifdef NEED_RETRY
3760 if (size > INT_MAX) {
3761 s += done;
3762 size -= done;
3763 goto retry;
3764 }
3765#endif
3766
3767 return (PyObject *)v;
3768}
3769
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003770PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003771 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003772 const char *errors)
3773{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003774 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3775}
3776
3777/*
3778 * Convert unicode into string object (MBCS).
3779 * Returns 0 if succeed, -1 otherwise.
3780 */
3781static int encode_mbcs(PyObject **repr,
3782 const Py_UNICODE *p, /* unicode */
3783 int size) /* size of unicode */
3784{
3785 int mbcssize = 0;
3786 Py_ssize_t n = 0;
3787
3788 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003789
3790 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003791 if (size > 0) {
3792 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3793 if (mbcssize == 0) {
3794 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3795 return -1;
3796 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003797 }
3798
Martin v. Löwisd8251432006-06-14 05:21:04 +00003799 if (*repr == NULL) {
3800 /* Create string object */
3801 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3802 if (*repr == NULL)
3803 return -1;
3804 }
3805 else {
3806 /* Extend string object */
3807 n = PyString_Size(*repr);
3808 if (_PyString_Resize(repr, n + mbcssize) < 0)
3809 return -1;
3810 }
3811
3812 /* Do the conversion */
3813 if (size > 0) {
3814 char *s = PyString_AS_STRING(*repr) + n;
3815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3816 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3817 return -1;
3818 }
3819 }
3820
3821 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003822}
3823
3824PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003825 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003826 const char *errors)
3827{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003828 PyObject *repr = NULL;
3829 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003830
Martin v. Löwisd8251432006-06-14 05:21:04 +00003831#ifdef NEED_RETRY
3832 retry:
3833 if (size > INT_MAX)
3834 ret = encode_mbcs(&repr, p, INT_MAX);
3835 else
3836#endif
3837 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839 if (ret < 0) {
3840 Py_XDECREF(repr);
3841 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003842 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003843
3844#ifdef NEED_RETRY
3845 if (size > INT_MAX) {
3846 p += INT_MAX;
3847 size -= INT_MAX;
3848 goto retry;
3849 }
3850#endif
3851
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003852 return repr;
3853}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003854
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003855PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3856{
3857 if (!PyUnicode_Check(unicode)) {
3858 PyErr_BadArgument();
3859 return NULL;
3860 }
3861 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3862 PyUnicode_GET_SIZE(unicode),
3863 NULL);
3864}
3865
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866#undef NEED_RETRY
3867
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003868#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003869
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870/* --- Character Mapping Codec -------------------------------------------- */
3871
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003873 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 PyObject *mapping,
3875 const char *errors)
3876{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t startinpos;
3879 Py_ssize_t endinpos;
3880 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 PyUnicodeObject *v;
3883 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003884 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 PyObject *errorHandler = NULL;
3886 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003887 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003888 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003889
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 /* Default to Latin-1 */
3891 if (mapping == NULL)
3892 return PyUnicode_DecodeLatin1(s, size, errors);
3893
3894 v = _PyUnicode_New(size);
3895 if (v == NULL)
3896 goto onError;
3897 if (size == 0)
3898 return (PyObject *)v;
3899 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003901 if (PyUnicode_CheckExact(mapping)) {
3902 mapstring = PyUnicode_AS_UNICODE(mapping);
3903 maplen = PyUnicode_GET_SIZE(mapping);
3904 while (s < e) {
3905 unsigned char ch = *s;
3906 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003908 if (ch < maplen)
3909 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003911 if (x == 0xfffe) {
3912 /* undefined mapping */
3913 outpos = p-PyUnicode_AS_UNICODE(v);
3914 startinpos = s-starts;
3915 endinpos = startinpos+1;
3916 if (unicode_decode_call_errorhandler(
3917 errors, &errorHandler,
3918 "charmap", "character maps to <undefined>",
3919 starts, size, &startinpos, &endinpos, &exc, &s,
3920 (PyObject **)&v, &outpos, &p)) {
3921 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003922 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003923 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003924 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003925 *p++ = x;
3926 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003928 }
3929 else {
3930 while (s < e) {
3931 unsigned char ch = *s;
3932 PyObject *w, *x;
3933
3934 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3935 w = PyInt_FromLong((long)ch);
3936 if (w == NULL)
3937 goto onError;
3938 x = PyObject_GetItem(mapping, w);
3939 Py_DECREF(w);
3940 if (x == NULL) {
3941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3942 /* No mapping found means: mapping is undefined. */
3943 PyErr_Clear();
3944 x = Py_None;
3945 Py_INCREF(x);
3946 } else
3947 goto onError;
3948 }
3949
3950 /* Apply mapping */
3951 if (PyInt_Check(x)) {
3952 long value = PyInt_AS_LONG(x);
3953 if (value < 0 || value > 65535) {
3954 PyErr_SetString(PyExc_TypeError,
3955 "character mapping must be in range(65536)");
3956 Py_DECREF(x);
3957 goto onError;
3958 }
3959 *p++ = (Py_UNICODE)value;
3960 }
3961 else if (x == Py_None) {
3962 /* undefined mapping */
3963 outpos = p-PyUnicode_AS_UNICODE(v);
3964 startinpos = s-starts;
3965 endinpos = startinpos+1;
3966 if (unicode_decode_call_errorhandler(
3967 errors, &errorHandler,
3968 "charmap", "character maps to <undefined>",
3969 starts, size, &startinpos, &endinpos, &exc, &s,
3970 (PyObject **)&v, &outpos, &p)) {
3971 Py_DECREF(x);
3972 goto onError;
3973 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003974 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003975 continue;
3976 }
3977 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003978 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003979
3980 if (targetsize == 1)
3981 /* 1-1 mapping */
3982 *p++ = *PyUnicode_AS_UNICODE(x);
3983
3984 else if (targetsize > 1) {
3985 /* 1-n mapping */
3986 if (targetsize > extrachars) {
3987 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003988 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3989 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003990 (targetsize << 2);
3991 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003992 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003993 if (_PyUnicode_Resize(&v,
3994 PyUnicode_GET_SIZE(v) + needed) < 0) {
3995 Py_DECREF(x);
3996 goto onError;
3997 }
3998 p = PyUnicode_AS_UNICODE(v) + oldpos;
3999 }
4000 Py_UNICODE_COPY(p,
4001 PyUnicode_AS_UNICODE(x),
4002 targetsize);
4003 p += targetsize;
4004 extrachars -= targetsize;
4005 }
4006 /* 1-0 mapping: skip the character */
4007 }
4008 else {
4009 /* wrong return value */
4010 PyErr_SetString(PyExc_TypeError,
4011 "character mapping must return integer, None or unicode");
4012 Py_DECREF(x);
4013 goto onError;
4014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004016 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 }
4019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004020 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 Py_XDECREF(errorHandler);
4023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004025
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_XDECREF(errorHandler);
4028 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 Py_XDECREF(v);
4030 return NULL;
4031}
4032
Martin v. Löwis3f767792006-06-04 19:36:28 +00004033/* Charmap encoding: the lookup table */
4034
4035struct encoding_map{
4036 PyObject_HEAD
4037 unsigned char level1[32];
4038 int count2, count3;
4039 unsigned char level23[1];
4040};
4041
4042static PyObject*
4043encoding_map_size(PyObject *obj, PyObject* args)
4044{
4045 struct encoding_map *map = (struct encoding_map*)obj;
4046 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4047 128*map->count3);
4048}
4049
4050static PyMethodDef encoding_map_methods[] = {
4051 {"size", encoding_map_size, METH_NOARGS,
4052 PyDoc_STR("Return the size (in bytes) of this object") },
4053 { 0 }
4054};
4055
4056static void
4057encoding_map_dealloc(PyObject* o)
4058{
4059 PyObject_FREE(o);
4060}
4061
4062static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004063 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004064 "EncodingMap", /*tp_name*/
4065 sizeof(struct encoding_map), /*tp_basicsize*/
4066 0, /*tp_itemsize*/
4067 /* methods */
4068 encoding_map_dealloc, /*tp_dealloc*/
4069 0, /*tp_print*/
4070 0, /*tp_getattr*/
4071 0, /*tp_setattr*/
4072 0, /*tp_compare*/
4073 0, /*tp_repr*/
4074 0, /*tp_as_number*/
4075 0, /*tp_as_sequence*/
4076 0, /*tp_as_mapping*/
4077 0, /*tp_hash*/
4078 0, /*tp_call*/
4079 0, /*tp_str*/
4080 0, /*tp_getattro*/
4081 0, /*tp_setattro*/
4082 0, /*tp_as_buffer*/
4083 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4084 0, /*tp_doc*/
4085 0, /*tp_traverse*/
4086 0, /*tp_clear*/
4087 0, /*tp_richcompare*/
4088 0, /*tp_weaklistoffset*/
4089 0, /*tp_iter*/
4090 0, /*tp_iternext*/
4091 encoding_map_methods, /*tp_methods*/
4092 0, /*tp_members*/
4093 0, /*tp_getset*/
4094 0, /*tp_base*/
4095 0, /*tp_dict*/
4096 0, /*tp_descr_get*/
4097 0, /*tp_descr_set*/
4098 0, /*tp_dictoffset*/
4099 0, /*tp_init*/
4100 0, /*tp_alloc*/
4101 0, /*tp_new*/
4102 0, /*tp_free*/
4103 0, /*tp_is_gc*/
4104};
4105
4106PyObject*
4107PyUnicode_BuildEncodingMap(PyObject* string)
4108{
4109 Py_UNICODE *decode;
4110 PyObject *result;
4111 struct encoding_map *mresult;
4112 int i;
4113 int need_dict = 0;
4114 unsigned char level1[32];
4115 unsigned char level2[512];
4116 unsigned char *mlevel1, *mlevel2, *mlevel3;
4117 int count2 = 0, count3 = 0;
4118
4119 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4120 PyErr_BadArgument();
4121 return NULL;
4122 }
4123 decode = PyUnicode_AS_UNICODE(string);
4124 memset(level1, 0xFF, sizeof level1);
4125 memset(level2, 0xFF, sizeof level2);
4126
4127 /* If there isn't a one-to-one mapping of NULL to \0,
4128 or if there are non-BMP characters, we need to use
4129 a mapping dictionary. */
4130 if (decode[0] != 0)
4131 need_dict = 1;
4132 for (i = 1; i < 256; i++) {
4133 int l1, l2;
4134 if (decode[i] == 0
4135 #ifdef Py_UNICODE_WIDE
4136 || decode[i] > 0xFFFF
4137 #endif
4138 ) {
4139 need_dict = 1;
4140 break;
4141 }
4142 if (decode[i] == 0xFFFE)
4143 /* unmapped character */
4144 continue;
4145 l1 = decode[i] >> 11;
4146 l2 = decode[i] >> 7;
4147 if (level1[l1] == 0xFF)
4148 level1[l1] = count2++;
4149 if (level2[l2] == 0xFF)
4150 level2[l2] = count3++;
4151 }
4152
4153 if (count2 >= 0xFF || count3 >= 0xFF)
4154 need_dict = 1;
4155
4156 if (need_dict) {
4157 PyObject *result = PyDict_New();
4158 PyObject *key, *value;
4159 if (!result)
4160 return NULL;
4161 for (i = 0; i < 256; i++) {
4162 key = value = NULL;
4163 key = PyInt_FromLong(decode[i]);
4164 value = PyInt_FromLong(i);
4165 if (!key || !value)
4166 goto failed1;
4167 if (PyDict_SetItem(result, key, value) == -1)
4168 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004169 Py_DECREF(key);
4170 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004171 }
4172 return result;
4173 failed1:
4174 Py_XDECREF(key);
4175 Py_XDECREF(value);
4176 Py_DECREF(result);
4177 return NULL;
4178 }
4179
4180 /* Create a three-level trie */
4181 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4182 16*count2 + 128*count3 - 1);
4183 if (!result)
4184 return PyErr_NoMemory();
4185 PyObject_Init(result, &EncodingMapType);
4186 mresult = (struct encoding_map*)result;
4187 mresult->count2 = count2;
4188 mresult->count3 = count3;
4189 mlevel1 = mresult->level1;
4190 mlevel2 = mresult->level23;
4191 mlevel3 = mresult->level23 + 16*count2;
4192 memcpy(mlevel1, level1, 32);
4193 memset(mlevel2, 0xFF, 16*count2);
4194 memset(mlevel3, 0, 128*count3);
4195 count3 = 0;
4196 for (i = 1; i < 256; i++) {
4197 int o1, o2, o3, i2, i3;
4198 if (decode[i] == 0xFFFE)
4199 /* unmapped character */
4200 continue;
4201 o1 = decode[i]>>11;
4202 o2 = (decode[i]>>7) & 0xF;
4203 i2 = 16*mlevel1[o1] + o2;
4204 if (mlevel2[i2] == 0xFF)
4205 mlevel2[i2] = count3++;
4206 o3 = decode[i] & 0x7F;
4207 i3 = 128*mlevel2[i2] + o3;
4208 mlevel3[i3] = i;
4209 }
4210 return result;
4211}
4212
4213static int
4214encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4215{
4216 struct encoding_map *map = (struct encoding_map*)mapping;
4217 int l1 = c>>11;
4218 int l2 = (c>>7) & 0xF;
4219 int l3 = c & 0x7F;
4220 int i;
4221
4222#ifdef Py_UNICODE_WIDE
4223 if (c > 0xFFFF) {
4224 return -1;
4225 }
4226#endif
4227 if (c == 0)
4228 return 0;
4229 /* level 1*/
4230 i = map->level1[l1];
4231 if (i == 0xFF) {
4232 return -1;
4233 }
4234 /* level 2*/
4235 i = map->level23[16*i+l2];
4236 if (i == 0xFF) {
4237 return -1;
4238 }
4239 /* level 3 */
4240 i = map->level23[16*map->count2 + 128*i + l3];
4241 if (i == 0) {
4242 return -1;
4243 }
4244 return i;
4245}
4246
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247/* Lookup the character ch in the mapping. If the character
4248 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004249 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 PyObject *w = PyInt_FromLong((long)c);
4253 PyObject *x;
4254
4255 if (w == NULL)
4256 return NULL;
4257 x = PyObject_GetItem(mapping, w);
4258 Py_DECREF(w);
4259 if (x == NULL) {
4260 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4261 /* No mapping found means: mapping is undefined. */
4262 PyErr_Clear();
4263 x = Py_None;
4264 Py_INCREF(x);
4265 return x;
4266 } else
4267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004269 else if (x == Py_None)
4270 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 else if (PyInt_Check(x)) {
4272 long value = PyInt_AS_LONG(x);
4273 if (value < 0 || value > 255) {
4274 PyErr_SetString(PyExc_TypeError,
4275 "character mapping must be in range(256)");
4276 Py_DECREF(x);
4277 return NULL;
4278 }
4279 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 else if (PyString_Check(x))
4282 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 /* wrong return value */
4285 PyErr_SetString(PyExc_TypeError,
4286 "character mapping must return integer, None or str");
4287 Py_DECREF(x);
4288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 }
4290}
4291
Martin v. Löwis3f767792006-06-04 19:36:28 +00004292static int
4293charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4294{
4295 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4296 /* exponentially overallocate to minimize reallocations */
4297 if (requiredsize < 2*outsize)
4298 requiredsize = 2*outsize;
4299 if (_PyString_Resize(outobj, requiredsize)) {
4300 return 0;
4301 }
4302 return 1;
4303}
4304
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308/* lookup the character, put the result in the output string and adjust
4309 various state variables. Reallocate the output string if not enough
4310 space is available. Return a new reference to the object that
4311 was put in the output buffer, or Py_None, if the mapping was undefined
4312 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004313 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004315charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004318 PyObject *rep;
4319 char *outstart;
4320 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321
Christian Heimese93237d2007-12-19 02:37:44 +00004322 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004323 int res = encoding_map_lookup(c, mapping);
4324 Py_ssize_t requiredsize = *outpos+1;
4325 if (res == -1)
4326 return enc_FAILED;
4327 if (outsize<requiredsize)
4328 if (!charmapencode_resize(outobj, outpos, requiredsize))
4329 return enc_EXCEPTION;
4330 outstart = PyString_AS_STRING(*outobj);
4331 outstart[(*outpos)++] = (char)res;
4332 return enc_SUCCESS;
4333 }
4334
4335 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004337 return enc_EXCEPTION;
4338 else if (rep==Py_None) {
4339 Py_DECREF(rep);
4340 return enc_FAILED;
4341 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004344 if (outsize<requiredsize)
4345 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004347 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004349 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4351 }
4352 else {
4353 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4355 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004356 if (outsize<requiredsize)
4357 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004359 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 memcpy(outstart + *outpos, repchars, repsize);
4363 *outpos += repsize;
4364 }
4365 }
Georg Brandl9f167602006-06-04 21:46:16 +00004366 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004367 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368}
4369
4370/* handle an error in PyUnicode_EncodeCharmap
4371 Return 0 on success, -1 on error */
4372static
4373int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004374 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004376 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378{
4379 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t repsize;
4381 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 Py_UNICODE *uni2;
4383 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t collstartpos = *inpos;
4385 Py_ssize_t collendpos = *inpos+1;
4386 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 char *encoding = "charmap";
4388 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004389 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 /* find all unencodable characters */
4392 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004393 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004394 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004395 int res = encoding_map_lookup(p[collendpos], mapping);
4396 if (res != -1)
4397 break;
4398 ++collendpos;
4399 continue;
4400 }
4401
4402 rep = charmapencode_lookup(p[collendpos], mapping);
4403 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 else if (rep!=Py_None) {
4406 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 break;
4408 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004409 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 ++collendpos;
4411 }
4412 /* cache callback name lookup
4413 * (if not done yet, i.e. it's the first error) */
4414 if (*known_errorHandler==-1) {
4415 if ((errors==NULL) || (!strcmp(errors, "strict")))
4416 *known_errorHandler = 1;
4417 else if (!strcmp(errors, "replace"))
4418 *known_errorHandler = 2;
4419 else if (!strcmp(errors, "ignore"))
4420 *known_errorHandler = 3;
4421 else if (!strcmp(errors, "xmlcharrefreplace"))
4422 *known_errorHandler = 4;
4423 else
4424 *known_errorHandler = 0;
4425 }
4426 switch (*known_errorHandler) {
4427 case 1: /* strict */
4428 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4429 return -1;
4430 case 2: /* replace */
4431 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4432 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 return -1;
4435 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004436 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4438 return -1;
4439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 }
4441 /* fall through */
4442 case 3: /* ignore */
4443 *inpos = collendpos;
4444 break;
4445 case 4: /* xmlcharrefreplace */
4446 /* generate replacement (temporarily (mis)uses p) */
4447 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4448 char buffer[2+29+1+1];
4449 char *cp;
4450 sprintf(buffer, "&#%d;", (int)p[collpos]);
4451 for (cp = buffer; *cp; ++cp) {
4452 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004453 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004455 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4457 return -1;
4458 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 }
4460 }
4461 *inpos = collendpos;
4462 break;
4463 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004464 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 encoding, reason, p, size, exceptionObject,
4466 collstartpos, collendpos, &newpos);
4467 if (repunicode == NULL)
4468 return -1;
4469 /* generate replacement */
4470 repsize = PyUnicode_GET_SIZE(repunicode);
4471 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4472 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004473 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 return -1;
4475 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004476 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4479 return -1;
4480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 }
4482 *inpos = newpos;
4483 Py_DECREF(repunicode);
4484 }
4485 return 0;
4486}
4487
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004489 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 PyObject *mapping,
4491 const char *errors)
4492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 /* output object */
4494 PyObject *res = NULL;
4495 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004496 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004498 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 PyObject *errorHandler = NULL;
4500 PyObject *exc = NULL;
4501 /* the following variable is used for caching string comparisons
4502 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4503 * 3=ignore, 4=xmlcharrefreplace */
4504 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505
4506 /* Default to Latin-1 */
4507 if (mapping == NULL)
4508 return PyUnicode_EncodeLatin1(p, size, errors);
4509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 /* allocate enough for a simple encoding without
4511 replacements, if we need more, we'll resize */
4512 res = PyString_FromStringAndSize(NULL, size);
4513 if (res == NULL)
4514 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004515 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 while (inpos<size) {
4519 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4521 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004523 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 if (charmap_encoding_error(p, size, &inpos, mapping,
4525 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004526 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004527 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004528 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 else
4532 /* done with this character => adjust input position */
4533 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 /* Resize if we allocated to much */
4537 if (respos<PyString_GET_SIZE(res)) {
4538 if (_PyString_Resize(&res, respos))
4539 goto onError;
4540 }
4541 Py_XDECREF(exc);
4542 Py_XDECREF(errorHandler);
4543 return res;
4544
4545 onError:
4546 Py_XDECREF(res);
4547 Py_XDECREF(exc);
4548 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 return NULL;
4550}
4551
4552PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4553 PyObject *mapping)
4554{
4555 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4556 PyErr_BadArgument();
4557 return NULL;
4558 }
4559 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4560 PyUnicode_GET_SIZE(unicode),
4561 mapping,
4562 NULL);
4563}
4564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565/* create or adjust a UnicodeTranslateError */
4566static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 const Py_UNICODE *unicode, Py_ssize_t size,
4568 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 if (*exceptionObject == NULL) {
4572 *exceptionObject = PyUnicodeTranslateError_Create(
4573 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
4575 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4577 goto onError;
4578 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4579 goto onError;
4580 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4581 goto onError;
4582 return;
4583 onError:
4584 Py_DECREF(*exceptionObject);
4585 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 }
4587}
4588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589/* raises a UnicodeTranslateError */
4590static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004591 const Py_UNICODE *unicode, Py_ssize_t size,
4592 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 const char *reason)
4594{
4595 make_translate_exception(exceptionObject,
4596 unicode, size, startpos, endpos, reason);
4597 if (*exceptionObject != NULL)
4598 PyCodec_StrictErrors(*exceptionObject);
4599}
4600
4601/* error handling callback helper:
4602 build arguments, call the callback and check the arguments,
4603 put the result into newpos and return the replacement string, which
4604 has to be freed by the caller */
4605static PyObject *unicode_translate_call_errorhandler(const char *errors,
4606 PyObject **errorHandler,
4607 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004608 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4609 Py_ssize_t startpos, Py_ssize_t endpos,
4610 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004612 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613
Martin v. Löwis412fb672006-04-13 06:34:32 +00004614 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 PyObject *restuple;
4616 PyObject *resunicode;
4617
4618 if (*errorHandler == NULL) {
4619 *errorHandler = PyCodec_LookupError(errors);
4620 if (*errorHandler == NULL)
4621 return NULL;
4622 }
4623
4624 make_translate_exception(exceptionObject,
4625 unicode, size, startpos, endpos, reason);
4626 if (*exceptionObject == NULL)
4627 return NULL;
4628
4629 restuple = PyObject_CallFunctionObjArgs(
4630 *errorHandler, *exceptionObject, NULL);
4631 if (restuple == NULL)
4632 return NULL;
4633 if (!PyTuple_Check(restuple)) {
4634 PyErr_Format(PyExc_TypeError, &argparse[4]);
4635 Py_DECREF(restuple);
4636 return NULL;
4637 }
4638 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 Py_DECREF(restuple);
4641 return NULL;
4642 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 if (i_newpos<0)
4644 *newpos = size+i_newpos;
4645 else
4646 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004647 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004648 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004649 Py_DECREF(restuple);
4650 return NULL;
4651 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 Py_INCREF(resunicode);
4653 Py_DECREF(restuple);
4654 return resunicode;
4655}
4656
4657/* Lookup the character ch in the mapping and put the result in result,
4658 which must be decrefed by the caller.
4659 Return 0 on success, -1 on error */
4660static
4661int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4662{
4663 PyObject *w = PyInt_FromLong((long)c);
4664 PyObject *x;
4665
4666 if (w == NULL)
4667 return -1;
4668 x = PyObject_GetItem(mapping, w);
4669 Py_DECREF(w);
4670 if (x == NULL) {
4671 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4672 /* No mapping found means: use 1:1 mapping. */
4673 PyErr_Clear();
4674 *result = NULL;
4675 return 0;
4676 } else
4677 return -1;
4678 }
4679 else if (x == Py_None) {
4680 *result = x;
4681 return 0;
4682 }
4683 else if (PyInt_Check(x)) {
4684 long value = PyInt_AS_LONG(x);
4685 long max = PyUnicode_GetMax();
4686 if (value < 0 || value > max) {
4687 PyErr_Format(PyExc_TypeError,
4688 "character mapping must be in range(0x%lx)", max+1);
4689 Py_DECREF(x);
4690 return -1;
4691 }
4692 *result = x;
4693 return 0;
4694 }
4695 else if (PyUnicode_Check(x)) {
4696 *result = x;
4697 return 0;
4698 }
4699 else {
4700 /* wrong return value */
4701 PyErr_SetString(PyExc_TypeError,
4702 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004703 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 return -1;
4705 }
4706}
4707/* ensure that *outobj is at least requiredsize characters long,
4708if not reallocate and adjust various state variables.
4709Return 0 on success, -1 on error */
4710static
Walter Dörwald4894c302003-10-24 14:25:28 +00004711int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004712 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004713{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004715 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004717 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004719 if (requiredsize < 2 * oldsize)
4720 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004721 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 return -1;
4723 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 }
4725 return 0;
4726}
4727/* lookup the character, put the result in the output string and adjust
4728 various state variables. Return a new reference to the object that
4729 was put in the output buffer in *result, or Py_None, if the mapping was
4730 undefined (in which case no character was written).
4731 The called must decref result.
4732 Return 0 on success, -1 on error. */
4733static
Walter Dörwald4894c302003-10-24 14:25:28 +00004734int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004735 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004736 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737{
Walter Dörwald4894c302003-10-24 14:25:28 +00004738 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 return -1;
4740 if (*res==NULL) {
4741 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004742 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 }
4744 else if (*res==Py_None)
4745 ;
4746 else if (PyInt_Check(*res)) {
4747 /* no overflow check, because we know that the space is enough */
4748 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4749 }
4750 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004751 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 if (repsize==1) {
4753 /* no overflow check, because we know that the space is enough */
4754 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4755 }
4756 else if (repsize!=0) {
4757 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004759 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004760 repsize - 1;
4761 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 return -1;
4763 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4764 *outp += repsize;
4765 }
4766 }
4767 else
4768 return -1;
4769 return 0;
4770}
4771
4772PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 PyObject *mapping,
4775 const char *errors)
4776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 /* output object */
4778 PyObject *res = NULL;
4779 /* pointers to the beginning and end+1 of input */
4780 const Py_UNICODE *startp = p;
4781 const Py_UNICODE *endp = p + size;
4782 /* pointer into the output */
4783 Py_UNICODE *str;
4784 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 char *reason = "character maps to <undefined>";
4787 PyObject *errorHandler = NULL;
4788 PyObject *exc = NULL;
4789 /* the following variable is used for caching string comparisons
4790 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4791 * 3=ignore, 4=xmlcharrefreplace */
4792 int known_errorHandler = -1;
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 if (mapping == NULL) {
4795 PyErr_BadArgument();
4796 return NULL;
4797 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798
4799 /* allocate enough for a simple 1:1 translation without
4800 replacements, if we need more, we'll resize */
4801 res = PyUnicode_FromUnicode(NULL, size);
4802 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 return res;
4806 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 while (p<endp) {
4809 /* try to encode it */
4810 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004811 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 goto onError;
4814 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004815 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 if (x!=Py_None) /* it worked => adjust input pointer */
4817 ++p;
4818 else { /* untranslatable character */
4819 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t repsize;
4821 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 Py_UNICODE *uni2;
4823 /* startpos for collecting untranslatable chars */
4824 const Py_UNICODE *collstart = p;
4825 const Py_UNICODE *collend = p+1;
4826 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 /* find all untranslatable characters */
4829 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004830 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 goto onError;
4832 Py_XDECREF(x);
4833 if (x!=Py_None)
4834 break;
4835 ++collend;
4836 }
4837 /* cache callback name lookup
4838 * (if not done yet, i.e. it's the first error) */
4839 if (known_errorHandler==-1) {
4840 if ((errors==NULL) || (!strcmp(errors, "strict")))
4841 known_errorHandler = 1;
4842 else if (!strcmp(errors, "replace"))
4843 known_errorHandler = 2;
4844 else if (!strcmp(errors, "ignore"))
4845 known_errorHandler = 3;
4846 else if (!strcmp(errors, "xmlcharrefreplace"))
4847 known_errorHandler = 4;
4848 else
4849 known_errorHandler = 0;
4850 }
4851 switch (known_errorHandler) {
4852 case 1: /* strict */
4853 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4854 goto onError;
4855 case 2: /* replace */
4856 /* No need to check for space, this is a 1:1 replacement */
4857 for (coll = collstart; coll<collend; ++coll)
4858 *str++ = '?';
4859 /* fall through */
4860 case 3: /* ignore */
4861 p = collend;
4862 break;
4863 case 4: /* xmlcharrefreplace */
4864 /* generate replacement (temporarily (mis)uses p) */
4865 for (p = collstart; p < collend; ++p) {
4866 char buffer[2+29+1+1];
4867 char *cp;
4868 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004869 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4871 goto onError;
4872 for (cp = buffer; *cp; ++cp)
4873 *str++ = *cp;
4874 }
4875 p = collend;
4876 break;
4877 default:
4878 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4879 reason, startp, size, &exc,
4880 collstart-startp, collend-startp, &newpos);
4881 if (repunicode == NULL)
4882 goto onError;
4883 /* generate replacement */
4884 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004885 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4887 Py_DECREF(repunicode);
4888 goto onError;
4889 }
4890 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4891 *str++ = *uni2;
4892 p = startp + newpos;
4893 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 }
4895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 /* Resize if we allocated to much */
4898 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004899 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004900 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004901 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 }
4903 Py_XDECREF(exc);
4904 Py_XDECREF(errorHandler);
4905 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 onError:
4908 Py_XDECREF(res);
4909 Py_XDECREF(exc);
4910 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 return NULL;
4912}
4913
4914PyObject *PyUnicode_Translate(PyObject *str,
4915 PyObject *mapping,
4916 const char *errors)
4917{
4918 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004919
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 str = PyUnicode_FromObject(str);
4921 if (str == NULL)
4922 goto onError;
4923 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4924 PyUnicode_GET_SIZE(str),
4925 mapping,
4926 errors);
4927 Py_DECREF(str);
4928 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004929
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 onError:
4931 Py_XDECREF(str);
4932 return NULL;
4933}
Tim Petersced69f82003-09-16 20:30:58 +00004934
Guido van Rossum9e896b32000-04-05 20:11:21 +00004935/* --- Decimal Encoder ---------------------------------------------------- */
4936
4937int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004938 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004939 char *output,
4940 const char *errors)
4941{
4942 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943 PyObject *errorHandler = NULL;
4944 PyObject *exc = NULL;
4945 const char *encoding = "decimal";
4946 const char *reason = "invalid decimal Unicode string";
4947 /* the following variable is used for caching string comparisons
4948 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4949 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004950
4951 if (output == NULL) {
4952 PyErr_BadArgument();
4953 return -1;
4954 }
4955
4956 p = s;
4957 end = s + length;
4958 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004960 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004962 Py_ssize_t repsize;
4963 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 Py_UNICODE *uni2;
4965 Py_UNICODE *collstart;
4966 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004967
Guido van Rossum9e896b32000-04-05 20:11:21 +00004968 if (Py_UNICODE_ISSPACE(ch)) {
4969 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004971 continue;
4972 }
4973 decimal = Py_UNICODE_TODECIMAL(ch);
4974 if (decimal >= 0) {
4975 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004977 continue;
4978 }
Guido van Rossumba477042000-04-06 18:18:10 +00004979 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004980 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004982 continue;
4983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004984 /* All other characters are considered unencodable */
4985 collstart = p;
4986 collend = p+1;
4987 while (collend < end) {
4988 if ((0 < *collend && *collend < 256) ||
4989 !Py_UNICODE_ISSPACE(*collend) ||
4990 Py_UNICODE_TODECIMAL(*collend))
4991 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004992 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 /* cache callback name lookup
4994 * (if not done yet, i.e. it's the first error) */
4995 if (known_errorHandler==-1) {
4996 if ((errors==NULL) || (!strcmp(errors, "strict")))
4997 known_errorHandler = 1;
4998 else if (!strcmp(errors, "replace"))
4999 known_errorHandler = 2;
5000 else if (!strcmp(errors, "ignore"))
5001 known_errorHandler = 3;
5002 else if (!strcmp(errors, "xmlcharrefreplace"))
5003 known_errorHandler = 4;
5004 else
5005 known_errorHandler = 0;
5006 }
5007 switch (known_errorHandler) {
5008 case 1: /* strict */
5009 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5010 goto onError;
5011 case 2: /* replace */
5012 for (p = collstart; p < collend; ++p)
5013 *output++ = '?';
5014 /* fall through */
5015 case 3: /* ignore */
5016 p = collend;
5017 break;
5018 case 4: /* xmlcharrefreplace */
5019 /* generate replacement (temporarily (mis)uses p) */
5020 for (p = collstart; p < collend; ++p)
5021 output += sprintf(output, "&#%d;", (int)*p);
5022 p = collend;
5023 break;
5024 default:
5025 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5026 encoding, reason, s, length, &exc,
5027 collstart-s, collend-s, &newpos);
5028 if (repunicode == NULL)
5029 goto onError;
5030 /* generate replacement */
5031 repsize = PyUnicode_GET_SIZE(repunicode);
5032 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5033 Py_UNICODE ch = *uni2;
5034 if (Py_UNICODE_ISSPACE(ch))
5035 *output++ = ' ';
5036 else {
5037 decimal = Py_UNICODE_TODECIMAL(ch);
5038 if (decimal >= 0)
5039 *output++ = '0' + decimal;
5040 else if (0 < ch && ch < 256)
5041 *output++ = (char)ch;
5042 else {
5043 Py_DECREF(repunicode);
5044 raise_encode_exception(&exc, encoding,
5045 s, length, collstart-s, collend-s, reason);
5046 goto onError;
5047 }
5048 }
5049 }
5050 p = s + newpos;
5051 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005052 }
5053 }
5054 /* 0-terminate the output string */
5055 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 Py_XDECREF(exc);
5057 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005058 return 0;
5059
5060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 Py_XDECREF(exc);
5062 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005063 return -1;
5064}
5065
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066/* --- Helpers ------------------------------------------------------------ */
5067
Eric Smitha9f7d622008-02-17 19:46:49 +00005068#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005069
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005070#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005071
Fredrik Lundha50d2012006-05-26 17:04:58 +00005072#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005073
5074#include "stringlib/count.h"
5075#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005076#include "stringlib/partition.h"
5077
/* Helper macro to fix up start/end slice values in place.

   Operates on integer lvalues named `start` and `end` that must exist in
   the enclosing scope, using (obj)->length as the sequence length:
     - a negative `start` has the length added and is then floored at 0;
     - `end` is capped at the length; a negative `end` has the length
       added and is then floored at 0.
   `start` is not capped at the length.

   Wrapped in do { } while (0) so that `FIX_START_END(x);` is exactly one
   statement and is safe as the body of an unbraced if/else or loop. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5090
Martin v. Löwis18e16552006-02-15 17:27:45 +00005091Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005092 PyObject *substr,
5093 Py_ssize_t start,
5094 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005096 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005097 PyUnicodeObject* str_obj;
5098 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005099
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005100 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5101 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005103 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5104 if (!sub_obj) {
5105 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 return -1;
5107 }
Tim Petersced69f82003-09-16 20:30:58 +00005108
Fredrik Lundhc8162812006-05-26 19:33:03 +00005109 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005110
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005111 result = stringlib_count(
5112 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5113 );
5114
5115 Py_DECREF(sub_obj);
5116 Py_DECREF(str_obj);
5117
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 return result;
5119}
5120
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005122 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005123 Py_ssize_t start,
5124 Py_ssize_t end,
5125 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005127 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005128
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005129 str = PyUnicode_FromObject(str);
5130 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005131 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005132 sub = PyUnicode_FromObject(sub);
5133 if (!sub) {
5134 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005135 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 }
Tim Petersced69f82003-09-16 20:30:58 +00005137
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005138 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005139 result = stringlib_find_slice(
5140 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5141 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5142 start, end
5143 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005144 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005145 result = stringlib_rfind_slice(
5146 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5147 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5148 start, end
5149 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005150
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005151 Py_DECREF(str);
5152 Py_DECREF(sub);
5153
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 return result;
5155}
5156
Tim Petersced69f82003-09-16 20:30:58 +00005157static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158int tailmatch(PyUnicodeObject *self,
5159 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005160 Py_ssize_t start,
5161 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 int direction)
5163{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 if (substring->length == 0)
5165 return 1;
5166
Fredrik Lundhc8162812006-05-26 19:33:03 +00005167 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168
5169 end -= substring->length;
5170 if (end < start)
5171 return 0;
5172
5173 if (direction > 0) {
5174 if (Py_UNICODE_MATCH(self, end, substring))
5175 return 1;
5176 } else {
5177 if (Py_UNICODE_MATCH(self, start, substring))
5178 return 1;
5179 }
5180
5181 return 0;
5182}
5183
Martin v. Löwis18e16552006-02-15 17:27:45 +00005184Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t start,
5187 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 int direction)
5189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005190 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005191
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 str = PyUnicode_FromObject(str);
5193 if (str == NULL)
5194 return -1;
5195 substr = PyUnicode_FromObject(substr);
5196 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005197 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 return -1;
5199 }
Tim Petersced69f82003-09-16 20:30:58 +00005200
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 result = tailmatch((PyUnicodeObject *)str,
5202 (PyUnicodeObject *)substr,
5203 start, end, direction);
5204 Py_DECREF(str);
5205 Py_DECREF(substr);
5206 return result;
5207}
5208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209/* Apply fixfct filter to the Unicode object self and return a
5210 reference to the modified object */
5211
Tim Petersced69f82003-09-16 20:30:58 +00005212static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213PyObject *fixup(PyUnicodeObject *self,
5214 int (*fixfct)(PyUnicodeObject *s))
5215{
5216
5217 PyUnicodeObject *u;
5218
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005219 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 if (u == NULL)
5221 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005222
5223 Py_UNICODE_COPY(u->str, self->str, self->length);
5224
Tim Peters7a29bd52001-09-12 03:03:31 +00005225 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 /* fixfct should return TRUE if it modified the buffer. If
5227 FALSE, return a reference to the original buffer instead
5228 (to save space, not time) */
5229 Py_INCREF(self);
5230 Py_DECREF(u);
5231 return (PyObject*) self;
5232 }
5233 return (PyObject*) u;
5234}
5235
Tim Petersced69f82003-09-16 20:30:58 +00005236static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237int fixupper(PyUnicodeObject *self)
5238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 Py_UNICODE *s = self->str;
5241 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005242
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 while (len-- > 0) {
5244 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 ch = Py_UNICODE_TOUPPER(*s);
5247 if (ch != *s) {
5248 status = 1;
5249 *s = ch;
5250 }
5251 s++;
5252 }
5253
5254 return status;
5255}
5256
Tim Petersced69f82003-09-16 20:30:58 +00005257static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258int fixlower(PyUnicodeObject *self)
5259{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005260 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 Py_UNICODE *s = self->str;
5262 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005263
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 while (len-- > 0) {
5265 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 ch = Py_UNICODE_TOLOWER(*s);
5268 if (ch != *s) {
5269 status = 1;
5270 *s = ch;
5271 }
5272 s++;
5273 }
5274
5275 return status;
5276}
5277
Tim Petersced69f82003-09-16 20:30:58 +00005278static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279int fixswapcase(PyUnicodeObject *self)
5280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 Py_UNICODE *s = self->str;
5283 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 while (len-- > 0) {
5286 if (Py_UNICODE_ISUPPER(*s)) {
5287 *s = Py_UNICODE_TOLOWER(*s);
5288 status = 1;
5289 } else if (Py_UNICODE_ISLOWER(*s)) {
5290 *s = Py_UNICODE_TOUPPER(*s);
5291 status = 1;
5292 }
5293 s++;
5294 }
5295
5296 return status;
5297}
5298
Tim Petersced69f82003-09-16 20:30:58 +00005299static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300int fixcapitalize(PyUnicodeObject *self)
5301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005302 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005303 Py_UNICODE *s = self->str;
5304 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005305
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005306 if (len == 0)
5307 return 0;
5308 if (Py_UNICODE_ISLOWER(*s)) {
5309 *s = Py_UNICODE_TOUPPER(*s);
5310 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005312 s++;
5313 while (--len > 0) {
5314 if (Py_UNICODE_ISUPPER(*s)) {
5315 *s = Py_UNICODE_TOLOWER(*s);
5316 status = 1;
5317 }
5318 s++;
5319 }
5320 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321}
5322
5323static
5324int fixtitle(PyUnicodeObject *self)
5325{
5326 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5327 register Py_UNICODE *e;
5328 int previous_is_cased;
5329
5330 /* Shortcut for single character strings */
5331 if (PyUnicode_GET_SIZE(self) == 1) {
5332 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5333 if (*p != ch) {
5334 *p = ch;
5335 return 1;
5336 }
5337 else
5338 return 0;
5339 }
Tim Petersced69f82003-09-16 20:30:58 +00005340
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 e = p + PyUnicode_GET_SIZE(self);
5342 previous_is_cased = 0;
5343 for (; p < e; p++) {
5344 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 if (previous_is_cased)
5347 *p = Py_UNICODE_TOLOWER(ch);
5348 else
5349 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005350
5351 if (Py_UNICODE_ISLOWER(ch) ||
5352 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 Py_UNICODE_ISTITLE(ch))
5354 previous_is_cased = 1;
5355 else
5356 previous_is_cased = 0;
5357 }
5358 return 1;
5359}
5360
Tim Peters8ce9f162004-08-27 01:49:32 +00005361PyObject *
5362PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
Tim Peters8ce9f162004-08-27 01:49:32 +00005364 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005365 const Py_UNICODE blank = ' ';
5366 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005367 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005368 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005369 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5370 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005371 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5372 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005373 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005374 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005375 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
Tim Peters05eba1f2004-08-27 21:32:02 +00005377 fseq = PySequence_Fast(seq, "");
5378 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005379 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005380 }
5381
Tim Peters91879ab2004-08-27 22:35:44 +00005382 /* Grrrr. A codec may be invoked to convert str objects to
5383 * Unicode, and so it's possible to call back into Python code
5384 * during PyUnicode_FromObject(), and so it's possible for a sick
5385 * codec to change the size of fseq (if seq is a list). Therefore
5386 * we have to keep refetching the size -- can't assume seqlen
5387 * is invariant.
5388 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005389 seqlen = PySequence_Fast_GET_SIZE(fseq);
5390 /* If empty sequence, return u"". */
5391 if (seqlen == 0) {
5392 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5393 goto Done;
5394 }
5395 /* If singleton sequence with an exact Unicode, return that. */
5396 if (seqlen == 1) {
5397 item = PySequence_Fast_GET_ITEM(fseq, 0);
5398 if (PyUnicode_CheckExact(item)) {
5399 Py_INCREF(item);
5400 res = (PyUnicodeObject *)item;
5401 goto Done;
5402 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005403 }
5404
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 /* At least two items to join, or one that isn't exact Unicode. */
5406 if (seqlen > 1) {
5407 /* Set up sep and seplen -- they're needed. */
5408 if (separator == NULL) {
5409 sep = &blank;
5410 seplen = 1;
5411 }
5412 else {
5413 internal_separator = PyUnicode_FromObject(separator);
5414 if (internal_separator == NULL)
5415 goto onError;
5416 sep = PyUnicode_AS_UNICODE(internal_separator);
5417 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005418 /* In case PyUnicode_FromObject() mutated seq. */
5419 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 }
5421 }
5422
5423 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005424 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005425 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005426 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 res_p = PyUnicode_AS_UNICODE(res);
5428 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005429
Tim Peters05eba1f2004-08-27 21:32:02 +00005430 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005431 Py_ssize_t itemlen;
5432 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005433
5434 item = PySequence_Fast_GET_ITEM(fseq, i);
5435 /* Convert item to Unicode. */
5436 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5437 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005438 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005440 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005441 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005442 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005443 item = PyUnicode_FromObject(item);
5444 if (item == NULL)
5445 goto onError;
5446 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005447
Tim Peters91879ab2004-08-27 22:35:44 +00005448 /* In case PyUnicode_FromObject() mutated seq. */
5449 seqlen = PySequence_Fast_GET_SIZE(fseq);
5450
Tim Peters8ce9f162004-08-27 01:49:32 +00005451 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005453 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005454 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005455 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 if (i < seqlen - 1) {
5457 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005458 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005459 goto Overflow;
5460 }
5461 if (new_res_used > res_alloc) {
5462 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005463 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005464 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005465 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005466 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005467 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005468 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005469 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005471 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005474
5475 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005476 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005477 res_p += itemlen;
5478 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005479 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005480 res_p += seplen;
5481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 res_used = new_res_used;
5484 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005485
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 /* Shrink res to match the used area; this probably can't fail,
5487 * but it's cheap to check.
5488 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005489 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005490 goto onError;
5491
5492 Done:
5493 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 return (PyObject *)res;
5496
Tim Peters8ce9f162004-08-27 01:49:32 +00005497 Overflow:
5498 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005499 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 Py_DECREF(item);
5501 /* fall through */
5502
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005504 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005506 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 return NULL;
5508}
5509
Tim Petersced69f82003-09-16 20:30:58 +00005510static
5511PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t left,
5513 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 Py_UNICODE fill)
5515{
5516 PyUnicodeObject *u;
5517
5518 if (left < 0)
5519 left = 0;
5520 if (right < 0)
5521 right = 0;
5522
Tim Peters7a29bd52001-09-12 03:03:31 +00005523 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 Py_INCREF(self);
5525 return self;
5526 }
5527
5528 u = _PyUnicode_New(left + self->length + right);
5529 if (u) {
5530 if (left)
5531 Py_UNICODE_FILL(u->str, fill, left);
5532 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5533 if (right)
5534 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5535 }
5536
5537 return u;
5538}
5539
/* Append the substring data[left:right] to `list` as a new unicode object.

   Relies on names from the enclosing function: a `PyObject *str` scratch
   variable, the `list` being built, and an `onError` label that is jumped
   to if either the object creation or the append fails.  The temporary
   reference held in `str` is released on both outcomes of the append.

   Wrapped in do { } while (0) so that `SPLIT_APPEND(...);` is a single
   statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
5550
5551static
5552PyObject *split_whitespace(PyUnicodeObject *self,
5553 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 register Py_ssize_t i;
5557 register Py_ssize_t j;
5558 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005560 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
5562 for (i = j = 0; i < len; ) {
5563 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005564 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 i++;
5566 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005567 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 i++;
5569 if (j < i) {
5570 if (maxcount-- <= 0)
5571 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005572 SPLIT_APPEND(buf, j, i);
5573 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 i++;
5575 j = i;
5576 }
5577 }
5578 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005579 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 }
5581 return list;
5582
5583 onError:
5584 Py_DECREF(list);
5585 return NULL;
5586}
5587
5588PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005589 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005591 register Py_ssize_t i;
5592 register Py_ssize_t j;
5593 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 PyObject *list;
5595 PyObject *str;
5596 Py_UNICODE *data;
5597
5598 string = PyUnicode_FromObject(string);
5599 if (string == NULL)
5600 return NULL;
5601 data = PyUnicode_AS_UNICODE(string);
5602 len = PyUnicode_GET_SIZE(string);
5603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 list = PyList_New(0);
5605 if (!list)
5606 goto onError;
5607
5608 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005609 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005612 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
5615 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005616 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 if (i < len) {
5618 if (data[i] == '\r' && i + 1 < len &&
5619 data[i+1] == '\n')
5620 i += 2;
5621 else
5622 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005623 if (keepends)
5624 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
Guido van Rossum86662912000-04-11 15:38:46 +00005626 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 j = i;
5628 }
5629 if (j < len) {
5630 SPLIT_APPEND(data, j, len);
5631 }
5632
5633 Py_DECREF(string);
5634 return list;
5635
5636 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005637 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 Py_DECREF(string);
5639 return NULL;
5640}
5641
Tim Petersced69f82003-09-16 20:30:58 +00005642static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643PyObject *split_char(PyUnicodeObject *self,
5644 PyObject *list,
5645 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 register Py_ssize_t i;
5649 register Py_ssize_t j;
5650 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005652 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653
5654 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005655 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 if (maxcount-- <= 0)
5657 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005658 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 i = j = i + 1;
5660 } else
5661 i++;
5662 }
5663 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005664 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 }
5666 return list;
5667
5668 onError:
5669 Py_DECREF(list);
5670 return NULL;
5671}
5672
Tim Petersced69f82003-09-16 20:30:58 +00005673static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674PyObject *split_substring(PyUnicodeObject *self,
5675 PyObject *list,
5676 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005679 register Py_ssize_t i;
5680 register Py_ssize_t j;
5681 Py_ssize_t len = self->length;
5682 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 PyObject *str;
5684
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005685 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 if (Py_UNICODE_MATCH(self, i, substring)) {
5687 if (maxcount-- <= 0)
5688 break;
5689 SPLIT_APPEND(self->str, j, i);
5690 i = j = i + sublen;
5691 } else
5692 i++;
5693 }
5694 if (j <= len) {
5695 SPLIT_APPEND(self->str, j, len);
5696 }
5697 return list;
5698
5699 onError:
5700 Py_DECREF(list);
5701 return NULL;
5702}
5703
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005704static
5705PyObject *rsplit_whitespace(PyUnicodeObject *self,
5706 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005707 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005709 register Py_ssize_t i;
5710 register Py_ssize_t j;
5711 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005712 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005713 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005714
5715 for (i = j = len - 1; i >= 0; ) {
5716 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005717 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005718 i--;
5719 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005720 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005721 i--;
5722 if (j > i) {
5723 if (maxcount-- <= 0)
5724 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005725 SPLIT_APPEND(buf, i + 1, j + 1);
5726 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005727 i--;
5728 j = i;
5729 }
5730 }
5731 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005732 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005733 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005734 if (PyList_Reverse(list) < 0)
5735 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005736 return list;
5737
5738 onError:
5739 Py_DECREF(list);
5740 return NULL;
5741}
5742
5743static
5744PyObject *rsplit_char(PyUnicodeObject *self,
5745 PyObject *list,
5746 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005748{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 register Py_ssize_t i;
5750 register Py_ssize_t j;
5751 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005753 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005754
5755 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005756 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005757 if (maxcount-- <= 0)
5758 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005759 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005760 j = i = i - 1;
5761 } else
5762 i--;
5763 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005764 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005765 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005767 if (PyList_Reverse(list) < 0)
5768 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769 return list;
5770
5771 onError:
5772 Py_DECREF(list);
5773 return NULL;
5774}
5775
5776static
5777PyObject *rsplit_substring(PyUnicodeObject *self,
5778 PyObject *list,
5779 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005780 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005782 register Py_ssize_t i;
5783 register Py_ssize_t j;
5784 Py_ssize_t len = self->length;
5785 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786 PyObject *str;
5787
5788 for (i = len - sublen, j = len; i >= 0; ) {
5789 if (Py_UNICODE_MATCH(self, i, substring)) {
5790 if (maxcount-- <= 0)
5791 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005792 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 j = i;
5794 i -= sublen;
5795 } else
5796 i--;
5797 }
5798 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005799 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005801 if (PyList_Reverse(list) < 0)
5802 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803 return list;
5804
5805 onError:
5806 Py_DECREF(list);
5807 return NULL;
5808}
5809
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810#undef SPLIT_APPEND
5811
5812static
5813PyObject *split(PyUnicodeObject *self,
5814 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005815 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816{
5817 PyObject *list;
5818
5819 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005820 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
5822 list = PyList_New(0);
5823 if (!list)
5824 return NULL;
5825
5826 if (substring == NULL)
5827 return split_whitespace(self,list,maxcount);
5828
5829 else if (substring->length == 1)
5830 return split_char(self,list,substring->str[0],maxcount);
5831
5832 else if (substring->length == 0) {
5833 Py_DECREF(list);
5834 PyErr_SetString(PyExc_ValueError, "empty separator");
5835 return NULL;
5836 }
5837 else
5838 return split_substring(self,list,substring,maxcount);
5839}
5840
Tim Petersced69f82003-09-16 20:30:58 +00005841static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842PyObject *rsplit(PyUnicodeObject *self,
5843 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005844 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005845{
5846 PyObject *list;
5847
5848 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005849 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005850
5851 list = PyList_New(0);
5852 if (!list)
5853 return NULL;
5854
5855 if (substring == NULL)
5856 return rsplit_whitespace(self,list,maxcount);
5857
5858 else if (substring->length == 1)
5859 return rsplit_char(self,list,substring->str[0],maxcount);
5860
5861 else if (substring->length == 0) {
5862 Py_DECREF(list);
5863 PyErr_SetString(PyExc_ValueError, "empty separator");
5864 return NULL;
5865 }
5866 else
5867 return rsplit_substring(self,list,substring,maxcount);
5868}
5869
5870static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871PyObject *replace(PyUnicodeObject *self,
5872 PyUnicodeObject *str1,
5873 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005874 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
5876 PyUnicodeObject *u;
5877
5878 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005879 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880
Fredrik Lundh347ee272006-05-24 16:35:18 +00005881 if (str1->length == str2->length) {
5882 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005883 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005884 if (str1->length == 1) {
5885 /* replace characters */
5886 Py_UNICODE u1, u2;
5887 if (!findchar(self->str, self->length, str1->str[0]))
5888 goto nothing;
5889 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5890 if (!u)
5891 return NULL;
5892 Py_UNICODE_COPY(u->str, self->str, self->length);
5893 u1 = str1->str[0];
5894 u2 = str2->str[0];
5895 for (i = 0; i < u->length; i++)
5896 if (u->str[i] == u1) {
5897 if (--maxcount < 0)
5898 break;
5899 u->str[i] = u2;
5900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005902 i = fastsearch(
5903 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005905 if (i < 0)
5906 goto nothing;
5907 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5908 if (!u)
5909 return NULL;
5910 Py_UNICODE_COPY(u->str, self->str, self->length);
5911 while (i <= self->length - str1->length)
5912 if (Py_UNICODE_MATCH(self, i, str1)) {
5913 if (--maxcount < 0)
5914 break;
5915 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5916 i += str1->length;
5917 } else
5918 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005921
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005922 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005923 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 Py_UNICODE *p;
5925
5926 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005927 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 if (n > maxcount)
5929 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005930 if (n == 0)
5931 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005932 /* new_size = self->length + n * (str2->length - str1->length)); */
5933 delta = (str2->length - str1->length);
5934 if (delta == 0) {
5935 new_size = self->length;
5936 } else {
5937 product = n * (str2->length - str1->length);
5938 if ((product / (str2->length - str1->length)) != n) {
5939 PyErr_SetString(PyExc_OverflowError,
5940 "replace string is too long");
5941 return NULL;
5942 }
5943 new_size = self->length + product;
5944 if (new_size < 0) {
5945 PyErr_SetString(PyExc_OverflowError,
5946 "replace string is too long");
5947 return NULL;
5948 }
5949 }
5950 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005951 if (!u)
5952 return NULL;
5953 i = 0;
5954 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005955 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005956 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005957 while (n-- > 0) {
5958 /* look for next match */
5959 j = i;
5960 while (j <= e) {
5961 if (Py_UNICODE_MATCH(self, j, str1))
5962 break;
5963 j++;
5964 }
5965 if (j > i) {
5966 if (j > e)
5967 break;
5968 /* copy unchanged part [i:j] */
5969 Py_UNICODE_COPY(p, self->str+i, j-i);
5970 p += j - i;
5971 }
5972 /* copy substitution string */
5973 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005974 Py_UNICODE_COPY(p, str2->str, str2->length);
5975 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005976 }
5977 i = j + str1->length;
5978 }
5979 if (i < self->length)
5980 /* copy tail [i:] */
5981 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005982 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005983 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005984 while (n > 0) {
5985 Py_UNICODE_COPY(p, str2->str, str2->length);
5986 p += str2->length;
5987 if (--n <= 0)
5988 break;
5989 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 }
5993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995
5996nothing:
5997 /* nothing to replace; return original string (when possible) */
5998 if (PyUnicode_CheckExact(self)) {
5999 Py_INCREF(self);
6000 return (PyObject *) self;
6001 }
6002 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
6005/* --- Unicode Object Methods --------------------------------------------- */
6006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006007PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008"S.title() -> unicode\n\
6009\n\
6010Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006011characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
6013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006014unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 return fixup(self, fixtitle);
6017}
6018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020"S.capitalize() -> unicode\n\
6021\n\
6022Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006023have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006026unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 return fixup(self, fixcapitalize);
6029}
6030
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split self on
   whitespace, capitalize every word in place inside the list, then join
   the words again via PyUnicode_Join(NULL, ...).  Returns NULL if any
   step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    while (idx < PyList_GET_SIZE(list)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(list, idx));
        PyList_SET_ITEM(list, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

onError:
    /* item is NULL here when capitalizing or joining failed */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6068
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006069/* Argument converter. Coerces to a single unicode character */
6070
6071static int
6072convert_uc(PyObject *obj, void *addr)
6073{
6074 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6075 PyObject *uniobj;
6076 Py_UNICODE *unistr;
6077
6078 uniobj = PyUnicode_FromObject(obj);
6079 if (uniobj == NULL) {
6080 PyErr_SetString(PyExc_TypeError,
6081 "The fill character cannot be converted to Unicode");
6082 return 0;
6083 }
6084 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6085 PyErr_SetString(PyExc_TypeError,
6086 "The fill character must be exactly one character long");
6087 Py_DECREF(uniobj);
6088 return 0;
6089 }
6090 unistr = PyUnicode_AS_UNICODE(uniobj);
6091 *fillcharloc = unistr[0];
6092 Py_DECREF(uniobj);
6093 return 1;
6094}
6095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006097"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006099Return S centered in a Unicode string of length width. Padding is\n\
6100done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101
6102static PyObject *
6103unicode_center(PyUnicodeObject *self, PyObject *args)
6104{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006105 Py_ssize_t marg, left;
6106 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006107 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
Thomas Woutersde017742006-02-16 19:34:37 +00006109 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 return NULL;
6111
Tim Peters7a29bd52001-09-12 03:03:31 +00006112 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 Py_INCREF(self);
6114 return (PyObject*) self;
6115 }
6116
6117 marg = width - self->length;
6118 left = marg / 2 + (marg & width & 1);
6119
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006120 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121}
6122
Marc-André Lemburge5034372000-08-08 08:04:29 +00006123#if 0
6124
6125/* This code should go into some future Unicode collation support
6126 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006127 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006129/* speedy UTF-16 code point order comparison */
6130/* gleaned from: */
6131/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6132
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006133static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006134{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006135 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006136 0, 0, 0, 0, 0, 0, 0, 0,
6137 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006138 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006139};
6140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141static int
6142unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006144 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 Py_UNICODE *s1 = str1->str;
6147 Py_UNICODE *s2 = str2->str;
6148
6149 len1 = str1->length;
6150 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006153 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006154
6155 c1 = *s1++;
6156 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006157
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158 if (c1 > (1<<11) * 26)
6159 c1 += utf16Fixup[c1>>11];
6160 if (c2 > (1<<11) * 26)
6161 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006162 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006163
6164 if (c1 != c2)
6165 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006166
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006167 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 }
6169
6170 return (len1 < len2) ? -1 : (len1 != len2);
6171}
6172
Marc-André Lemburge5034372000-08-08 08:04:29 +00006173#else
6174
6175static int
6176unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006179
6180 Py_UNICODE *s1 = str1->str;
6181 Py_UNICODE *s2 = str2->str;
6182
6183 len1 = str1->length;
6184 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006185
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006187 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006188
Fredrik Lundh45714e92001-06-26 16:39:36 +00006189 c1 = *s1++;
6190 c2 = *s2++;
6191
6192 if (c1 != c2)
6193 return (c1 < c2) ? -1 : 1;
6194
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195 len1--; len2--;
6196 }
6197
6198 return (len1 < len2) ? -1 : (len1 != len2);
6199}
6200
6201#endif
6202
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203int PyUnicode_Compare(PyObject *left,
6204 PyObject *right)
6205{
6206 PyUnicodeObject *u = NULL, *v = NULL;
6207 int result;
6208
6209 /* Coerce the two arguments */
6210 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6211 if (u == NULL)
6212 goto onError;
6213 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6214 if (v == NULL)
6215 goto onError;
6216
Thomas Wouters7e474022000-07-16 12:04:32 +00006217 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 if (v == u) {
6219 Py_DECREF(u);
6220 Py_DECREF(v);
6221 return 0;
6222 }
6223
6224 result = unicode_compare(u, v);
6225
6226 Py_DECREF(u);
6227 Py_DECREF(v);
6228 return result;
6229
6230onError:
6231 Py_XDECREF(u);
6232 Py_XDECREF(v);
6233 return -1;
6234}
6235
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006236PyObject *PyUnicode_RichCompare(PyObject *left,
6237 PyObject *right,
6238 int op)
6239{
6240 int result;
6241
6242 result = PyUnicode_Compare(left, right);
6243 if (result == -1 && PyErr_Occurred())
6244 goto onError;
6245
6246 /* Convert the return value to a Boolean */
6247 switch (op) {
6248 case Py_EQ:
6249 result = (result == 0);
6250 break;
6251 case Py_NE:
6252 result = (result != 0);
6253 break;
6254 case Py_LE:
6255 result = (result <= 0);
6256 break;
6257 case Py_GE:
6258 result = (result >= 0);
6259 break;
6260 case Py_LT:
6261 result = (result == -1);
6262 break;
6263 case Py_GT:
6264 result = (result == 1);
6265 break;
6266 }
6267 return PyBool_FromLong(result);
6268
6269 onError:
6270
6271 /* Standard case
6272
6273 Type errors mean that PyUnicode_FromObject() could not convert
6274 one of the arguments (usually the right hand side) to Unicode,
6275 ie. we can't handle the comparison request. However, it is
6276 possible that the other object knows a comparison method, which
6277 is why we return Py_NotImplemented to give the other object a
6278 chance.
6279
6280 */
6281 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6282 PyErr_Clear();
6283 Py_INCREF(Py_NotImplemented);
6284 return Py_NotImplemented;
6285 }
6286 if (op != Py_EQ && op != Py_NE)
6287 return NULL;
6288
6289 /* Equality comparison.
6290
6291 This is a special case: we silence any PyExc_UnicodeDecodeError
6292 and instead turn it into a PyErr_UnicodeWarning.
6293
6294 */
6295 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6296 return NULL;
6297 PyErr_Clear();
6298 if (PyErr_Warn(PyExc_UnicodeWarning,
6299 (op == Py_EQ) ?
6300 "Unicode equal comparison "
6301 "failed to convert both arguments to Unicode - "
6302 "interpreting them as being unequal" :
6303 "Unicode unequal comparison "
6304 "failed to convert both arguments to Unicode - "
6305 "interpreting them as being unequal"
6306 ) < 0)
6307 return NULL;
6308 result = (op == Py_NE);
6309 return PyBool_FromLong(result);
6310}
6311
Guido van Rossum403d68b2000-03-13 15:55:09 +00006312int PyUnicode_Contains(PyObject *container,
6313 PyObject *element)
6314{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006315 PyObject *str, *sub;
6316 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006317
6318 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006319 sub = PyUnicode_FromObject(element);
6320 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006321 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006322 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006323 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006324 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006325
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006326 str = PyUnicode_FromObject(container);
6327 if (!str) {
6328 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006329 return -1;
6330 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006331
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006332 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006333
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006334 Py_DECREF(str);
6335 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006336
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006337 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006338}
6339
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340/* Concat to string or Unicode object giving a new Unicode object. */
6341
6342PyObject *PyUnicode_Concat(PyObject *left,
6343 PyObject *right)
6344{
6345 PyUnicodeObject *u = NULL, *v = NULL, *w;
6346
6347 /* Coerce the two arguments */
6348 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6349 if (u == NULL)
6350 goto onError;
6351 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6352 if (v == NULL)
6353 goto onError;
6354
6355 /* Shortcuts */
6356 if (v == unicode_empty) {
6357 Py_DECREF(v);
6358 return (PyObject *)u;
6359 }
6360 if (u == unicode_empty) {
6361 Py_DECREF(u);
6362 return (PyObject *)v;
6363 }
6364
6365 /* Concat the two Unicode strings */
6366 w = _PyUnicode_New(u->length + v->length);
6367 if (w == NULL)
6368 goto onError;
6369 Py_UNICODE_COPY(w->str, u->str, u->length);
6370 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6371
6372 Py_DECREF(u);
6373 Py_DECREF(v);
6374 return (PyObject *)w;
6375
6376onError:
6377 Py_XDECREF(u);
6378 Py_XDECREF(v);
6379 return NULL;
6380}
6381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006382PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383"S.count(sub[, start[, end]]) -> int\n\
6384\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006385Return the number of non-overlapping occurrences of substring sub in\n\
6386Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006387interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
6389static PyObject *
6390unicode_count(PyUnicodeObject *self, PyObject *args)
6391{
6392 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006393 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006394 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 PyObject *result;
6396
Guido van Rossumb8872e62000-05-09 14:14:27 +00006397 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6398 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 return NULL;
6400
6401 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006402 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 if (substring == NULL)
6404 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006405
Fredrik Lundhc8162812006-05-26 19:33:03 +00006406 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006408 result = PyInt_FromSsize_t(
6409 stringlib_count(self->str + start, end - start,
6410 substring->str, substring->length)
6411 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 return result;
6416}
6417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006418PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006419"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006421Encodes S using the codec registered for encoding. encoding defaults\n\
6422to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006423handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6425'xmlcharrefreplace' as well as any other name registered with\n\
6426codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
6428static PyObject *
6429unicode_encode(PyUnicodeObject *self, PyObject *args)
6430{
6431 char *encoding = NULL;
6432 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433 PyObject *v;
6434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6436 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006437 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006438 if (v == NULL)
6439 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006440 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6441 PyErr_Format(PyExc_TypeError,
6442 "encoder did not return a string/unicode object "
6443 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006444 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006445 Py_DECREF(v);
6446 return NULL;
6447 }
6448 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006449
6450 onError:
6451 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006452}
6453
6454PyDoc_STRVAR(decode__doc__,
6455"S.decode([encoding[,errors]]) -> string or unicode\n\
6456\n\
6457Decodes S using the codec registered for encoding. encoding defaults\n\
6458to the default encoding. errors may be given to set a different error\n\
6459handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6460a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6461as well as any other name registerd with codecs.register_error that is\n\
6462able to handle UnicodeDecodeErrors.");
6463
6464static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006465unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466{
6467 char *encoding = NULL;
6468 char *errors = NULL;
6469 PyObject *v;
6470
6471 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6472 return NULL;
6473 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006474 if (v == NULL)
6475 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006476 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6477 PyErr_Format(PyExc_TypeError,
6478 "decoder did not return a string/unicode object "
6479 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006480 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006481 Py_DECREF(v);
6482 return NULL;
6483 }
6484 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006485
6486 onError:
6487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488}
6489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491"S.expandtabs([tabsize]) -> unicode\n\
6492\n\
6493Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006494If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495
6496static PyObject*
6497unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6498{
6499 Py_UNICODE *e;
6500 Py_UNICODE *p;
6501 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006502 Py_UNICODE *qe;
6503 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 PyUnicodeObject *u;
6505 int tabsize = 8;
6506
6507 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6508 return NULL;
6509
Thomas Wouters7e474022000-07-16 12:04:32 +00006510 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006511 i = 0; /* chars up to and including most recent \n or \r */
6512 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6513 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 for (p = self->str; p < e; p++)
6515 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006516 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006517 incr = tabsize - (j % tabsize); /* cannot overflow */
6518 if (j > PY_SSIZE_T_MAX - incr)
6519 goto overflow1;
6520 j += incr;
6521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 }
6523 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006524 if (j > PY_SSIZE_T_MAX - 1)
6525 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 j++;
6527 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006528 if (i > PY_SSIZE_T_MAX - j)
6529 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006531 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
6533 }
6534
Guido van Rossum5bdff602008-03-11 21:18:06 +00006535 if (i > PY_SSIZE_T_MAX - j)
6536 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006537
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 /* Second pass: create output string and fill it */
6539 u = _PyUnicode_New(i + j);
6540 if (!u)
6541 return NULL;
6542
Guido van Rossum5bdff602008-03-11 21:18:06 +00006543 j = 0; /* same as in first pass */
6544 q = u->str; /* next output char */
6545 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547 for (p = self->str; p < e; p++)
6548 if (*p == '\t') {
6549 if (tabsize > 0) {
6550 i = tabsize - (j % tabsize);
6551 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006552 while (i--) {
6553 if (q >= qe)
6554 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 }
6558 }
6559 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006560 if (q >= qe)
6561 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006563 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 if (*p == '\n' || *p == '\r')
6565 j = 0;
6566 }
6567
6568 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006569
6570 overflow2:
6571 Py_DECREF(u);
6572 overflow1:
6573 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575}
6576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578"S.find(sub [,start [,end]]) -> int\n\
6579\n\
6580Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006581such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582arguments start and end are interpreted as in slice notation.\n\
6583\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006584Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586static PyObject *
6587unicode_find(PyUnicodeObject *self, PyObject *args)
6588{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006589 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006590 Py_ssize_t start;
6591 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006592 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
Facundo Batista57d56692007-11-16 18:04:14 +00006594 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006597 result = stringlib_find_slice(
6598 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6599 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6600 start, end
6601 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
6603 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006604
6605 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606}
6607
6608static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006609unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
6611 if (index < 0 || index >= self->length) {
6612 PyErr_SetString(PyExc_IndexError, "string index out of range");
6613 return NULL;
6614 }
6615
6616 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6617}
6618
6619static long
6620unicode_hash(PyUnicodeObject *self)
6621{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006622 /* Since Unicode objects compare equal to their ASCII string
6623 counterparts, they should use the individual character values
6624 as basis for their hash value. This is needed to assure that
6625 strings and Unicode objects behave in the same way as
6626 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Martin v. Löwis18e16552006-02-15 17:27:45 +00006628 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006629 register Py_UNICODE *p;
6630 register long x;
6631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 if (self->hash != -1)
6633 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006634 len = PyUnicode_GET_SIZE(self);
6635 p = PyUnicode_AS_UNICODE(self);
6636 x = *p << 7;
6637 while (--len >= 0)
6638 x = (1000003*x) ^ *p++;
6639 x ^= PyUnicode_GET_SIZE(self);
6640 if (x == -1)
6641 x = -2;
6642 self->hash = x;
6643 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647"S.index(sub [,start [,end]]) -> int\n\
6648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650
6651static PyObject *
6652unicode_index(PyUnicodeObject *self, PyObject *args)
6653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006654 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006655 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006656 Py_ssize_t start;
6657 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
Facundo Batista57d56692007-11-16 18:04:14 +00006659 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006662 result = stringlib_find_slice(
6663 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6664 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6665 start, end
6666 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
6668 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 if (result < 0) {
6671 PyErr_SetString(PyExc_ValueError, "substring not found");
6672 return NULL;
6673 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006674
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006678PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006679"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006681Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
6684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006685unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686{
6687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6688 register const Py_UNICODE *e;
6689 int cased;
6690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 /* Shortcut for single character strings */
6692 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006693 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006695 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006696 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 e = p + PyUnicode_GET_SIZE(self);
6700 cased = 0;
6701 for (; p < e; p++) {
6702 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006705 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 else if (!cased && Py_UNICODE_ISLOWER(ch))
6707 cased = 1;
6708 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006713"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006715Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6722 register const Py_UNICODE *e;
6723 int cased;
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 /* Shortcut for single character strings */
6726 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006730 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 e = p + PyUnicode_GET_SIZE(self);
6734 cased = 0;
6735 for (; p < e; p++) {
6736 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006737
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 else if (!cased && Py_UNICODE_ISUPPER(ch))
6741 cased = 1;
6742 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006749Return True if S is a titlecased string and there is at least one\n\
6750character in S, i.e. upper- and titlecase characters may only\n\
6751follow uncased characters and lowercase characters only cased ones.\n\
6752Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006755unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758 register const Py_UNICODE *e;
6759 int cased, previous_is_cased;
6760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 /* Shortcut for single character strings */
6762 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006763 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6764 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006766 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006767 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 e = p + PyUnicode_GET_SIZE(self);
6771 cased = 0;
6772 previous_is_cased = 0;
6773 for (; p < e; p++) {
6774 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6777 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 previous_is_cased = 1;
6780 cased = 1;
6781 }
6782 else if (Py_UNICODE_ISLOWER(ch)) {
6783 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 previous_is_cased = 1;
6786 cased = 1;
6787 }
6788 else
6789 previous_is_cased = 0;
6790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792}
6793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006794PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006797Return True if all characters in S are whitespace\n\
6798and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
6800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006801unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802{
6803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6804 register const Py_UNICODE *e;
6805
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 /* Shortcut for single character strings */
6807 if (PyUnicode_GET_SIZE(self) == 1 &&
6808 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006811 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006812 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006813 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006814
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 e = p + PyUnicode_GET_SIZE(self);
6816 for (; p < e; p++) {
6817 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821}
6822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006825\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006826Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006828
6829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006830unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831{
6832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6833 register const Py_UNICODE *e;
6834
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835 /* Shortcut for single character strings */
6836 if (PyUnicode_GET_SIZE(self) == 1 &&
6837 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006839
6840 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006841 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006843
6844 e = p + PyUnicode_GET_SIZE(self);
6845 for (; p < e; p++) {
6846 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850}
6851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006852PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006854\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006855Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006856and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006857
6858static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006859unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860{
6861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6862 register const Py_UNICODE *e;
6863
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864 /* Shortcut for single character strings */
6865 if (PyUnicode_GET_SIZE(self) == 1 &&
6866 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868
6869 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006870 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006872
6873 e = p + PyUnicode_GET_SIZE(self);
6874 for (; p < e; p++) {
6875 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006885False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
6887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006888unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
6890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6891 register const Py_UNICODE *e;
6892
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 /* Shortcut for single character strings */
6894 if (PyUnicode_GET_SIZE(self) == 1 &&
6895 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006898 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006899 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006901
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 e = p + PyUnicode_GET_SIZE(self);
6903 for (; p < e; p++) {
6904 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006913Return True if all characters in S are digits\n\
6914and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006917unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
6919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6920 register const Py_UNICODE *e;
6921
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 /* Shortcut for single character strings */
6923 if (PyUnicode_GET_SIZE(self) == 1 &&
6924 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006927 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006928 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 e = p + PyUnicode_GET_SIZE(self);
6932 for (; p < e; p++) {
6933 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006942Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944
6945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006946unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
6948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6949 register const Py_UNICODE *e;
6950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 /* Shortcut for single character strings */
6952 if (PyUnicode_GET_SIZE(self) == 1 &&
6953 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006956 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006957 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 e = p + PyUnicode_GET_SIZE(self);
6961 for (; p < e; p++) {
6962 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966}
6967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006968PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969"S.join(sequence) -> unicode\n\
6970\n\
6971Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
6974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006975unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006977 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
Martin v. Löwis18e16552006-02-15 17:27:45 +00006980static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981unicode_length(PyUnicodeObject *self)
6982{
6983 return self->length;
6984}
6985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006986PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006987"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988\n\
6989Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006990done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992static PyObject *
6993unicode_ljust(PyUnicodeObject *self, PyObject *args)
6994{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006995 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006996 Py_UNICODE fillchar = ' ';
6997
Martin v. Löwis412fb672006-04-13 06:34:32 +00006998 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
7000
Tim Peters7a29bd52001-09-12 03:03:31 +00007001 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 Py_INCREF(self);
7003 return (PyObject*) self;
7004 }
7005
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007006 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007}
7008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007009PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010"S.lower() -> unicode\n\
7011\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007012Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
7014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 return fixup(self, fixlower);
7018}
7019
/* Selectors for the three strip flavours.  They double as indices
   into stripformat[] below, so the values must stay 0, 1 and 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above.
   Both the strings and the table itself are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7028
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029/* externally visible for str.strip(unicode) */
7030PyObject *
7031_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7032{
7033 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007034 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007035 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007036 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7037 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007038
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007039 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7040
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041 i = 0;
7042 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007043 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7044 i++;
7045 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046 }
7047
7048 j = len;
7049 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007050 do {
7051 j--;
7052 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7053 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054 }
7055
7056 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007057 Py_INCREF(self);
7058 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007059 }
7060 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007061 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062}
7063
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064
7065static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007069 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007070
7071 i = 0;
7072 if (striptype != RIGHTSTRIP) {
7073 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7074 i++;
7075 }
7076 }
7077
7078 j = len;
7079 if (striptype != LEFTSTRIP) {
7080 do {
7081 j--;
7082 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7083 j++;
7084 }
7085
7086 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7087 Py_INCREF(self);
7088 return (PyObject*)self;
7089 }
7090 else
7091 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092}
7093
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094
7095static PyObject *
7096do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7097{
7098 PyObject *sep = NULL;
7099
7100 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7101 return NULL;
7102
7103 if (sep != NULL && sep != Py_None) {
7104 if (PyUnicode_Check(sep))
7105 return _PyUnicode_XStrip(self, striptype, sep);
7106 else if (PyString_Check(sep)) {
7107 PyObject *res;
7108 sep = PyUnicode_FromObject(sep);
7109 if (sep==NULL)
7110 return NULL;
7111 res = _PyUnicode_XStrip(self, striptype, sep);
7112 Py_DECREF(sep);
7113 return res;
7114 }
7115 else {
7116 PyErr_Format(PyExc_TypeError,
7117 "%s arg must be None, unicode or str",
7118 STRIPNAME(striptype));
7119 return NULL;
7120 }
7121 }
7122
7123 return do_strip(self, striptype);
7124}
7125
7126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007128"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007129\n\
7130Return a copy of the string S with leading and trailing\n\
7131whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007132If chars is given and not None, remove characters in chars instead.\n\
7133If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134
7135static PyObject *
7136unicode_strip(PyUnicodeObject *self, PyObject *args)
7137{
7138 if (PyTuple_GET_SIZE(args) == 0)
7139 return do_strip(self, BOTHSTRIP); /* Common case */
7140 else
7141 return do_argstrip(self, BOTHSTRIP, args);
7142}
7143
7144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007146"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147\n\
7148Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007149If chars is given and not None, remove characters in chars instead.\n\
7150If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007151
7152static PyObject *
7153unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7154{
7155 if (PyTuple_GET_SIZE(args) == 0)
7156 return do_strip(self, LEFTSTRIP); /* Common case */
7157 else
7158 return do_argstrip(self, LEFTSTRIP, args);
7159}
7160
7161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007163"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007164\n\
7165Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007166If chars is given and not None, remove characters in chars instead.\n\
7167If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007168
7169static PyObject *
7170unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7171{
7172 if (PyTuple_GET_SIZE(args) == 0)
7173 return do_strip(self, RIGHTSTRIP); /* Common case */
7174 else
7175 return do_argstrip(self, RIGHTSTRIP, args);
7176}
7177
7178
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007180unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181{
7182 PyUnicodeObject *u;
7183 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007185 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
7187 if (len < 0)
7188 len = 0;
7189
Tim Peters7a29bd52001-09-12 03:03:31 +00007190 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 /* no repeat, return original string */
7192 Py_INCREF(str);
7193 return (PyObject*) str;
7194 }
Tim Peters8f422462000-09-09 06:13:41 +00007195
7196 /* ensure # of chars needed doesn't overflow int and # of bytes
7197 * needed doesn't overflow size_t
7198 */
7199 nchars = len * str->length;
7200 if (len && nchars / len != str->length) {
7201 PyErr_SetString(PyExc_OverflowError,
7202 "repeated string is too long");
7203 return NULL;
7204 }
7205 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7206 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7207 PyErr_SetString(PyExc_OverflowError,
7208 "repeated string is too long");
7209 return NULL;
7210 }
7211 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 if (!u)
7213 return NULL;
7214
7215 p = u->str;
7216
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007217 if (str->length == 1 && len > 0) {
7218 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007219 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007220 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007221 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007222 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007223 done = str->length;
7224 }
7225 while (done < nchars) {
7226 int n = (done <= nchars-done) ? done : nchars-done;
7227 Py_UNICODE_COPY(p+done, p, n);
7228 done += n;
7229 }
7230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231
7232 return (PyObject*) u;
7233}
7234
7235PyObject *PyUnicode_Replace(PyObject *obj,
7236 PyObject *subobj,
7237 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007238 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239{
7240 PyObject *self;
7241 PyObject *str1;
7242 PyObject *str2;
7243 PyObject *result;
7244
7245 self = PyUnicode_FromObject(obj);
7246 if (self == NULL)
7247 return NULL;
7248 str1 = PyUnicode_FromObject(subobj);
7249 if (str1 == NULL) {
7250 Py_DECREF(self);
7251 return NULL;
7252 }
7253 str2 = PyUnicode_FromObject(replobj);
7254 if (str2 == NULL) {
7255 Py_DECREF(self);
7256 Py_DECREF(str1);
7257 return NULL;
7258 }
Tim Petersced69f82003-09-16 20:30:58 +00007259 result = replace((PyUnicodeObject *)self,
7260 (PyUnicodeObject *)str1,
7261 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 maxcount);
7263 Py_DECREF(self);
7264 Py_DECREF(str1);
7265 Py_DECREF(str2);
7266 return result;
7267}
7268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007269PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270"S.replace (old, new[, maxsplit]) -> unicode\n\
7271\n\
7272Return a copy of S with all occurrences of substring\n\
7273old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007274given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
7276static PyObject*
7277unicode_replace(PyUnicodeObject *self, PyObject *args)
7278{
7279 PyUnicodeObject *str1;
7280 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007281 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 PyObject *result;
7283
Martin v. Löwis18e16552006-02-15 17:27:45 +00007284 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 return NULL;
7286 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7287 if (str1 == NULL)
7288 return NULL;
7289 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007290 if (str2 == NULL) {
7291 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
7295 result = replace(self, str1, str2, maxcount);
7296
7297 Py_DECREF(str1);
7298 Py_DECREF(str2);
7299 return result;
7300}
7301
7302static
7303PyObject *unicode_repr(PyObject *unicode)
7304{
7305 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7306 PyUnicode_GET_SIZE(unicode),
7307 1);
7308}
7309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007310PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311"S.rfind(sub [,start [,end]]) -> int\n\
7312\n\
7313Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007314such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315arguments start and end are interpreted as in slice notation.\n\
7316\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
7319static PyObject *
7320unicode_rfind(PyUnicodeObject *self, PyObject *args)
7321{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007322 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007323 Py_ssize_t start;
7324 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007325 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
Facundo Batista57d56692007-11-16 18:04:14 +00007327 if (!_ParseTupleFinds(args, &substring, &start, &end))
7328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007330 result = stringlib_rfind_slice(
7331 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7332 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7333 start, end
7334 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
7336 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007337
7338 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339}
7340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342"S.rindex(sub [,start [,end]]) -> int\n\
7343\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007344Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
7346static PyObject *
7347unicode_rindex(PyUnicodeObject *self, PyObject *args)
7348{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007349 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007350 Py_ssize_t start;
7351 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007352 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
Facundo Batista57d56692007-11-16 18:04:14 +00007354 if (!_ParseTupleFinds(args, &substring, &start, &end))
7355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007357 result = stringlib_rfind_slice(
7358 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7359 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7360 start, end
7361 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362
7363 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 if (result < 0) {
7366 PyErr_SetString(PyExc_ValueError, "substring not found");
7367 return NULL;
7368 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370}
7371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007372PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007373"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374\n\
7375Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007376done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject *
7379unicode_rjust(PyUnicodeObject *self, PyObject *args)
7380{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007381 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007382 Py_UNICODE fillchar = ' ';
7383
Martin v. Löwis412fb672006-04-13 06:34:32 +00007384 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 return NULL;
7386
Tim Peters7a29bd52001-09-12 03:03:31 +00007387 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 Py_INCREF(self);
7389 return (PyObject*) self;
7390 }
7391
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007392 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393}
7394
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007396unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397{
7398 /* standard clamping */
7399 if (start < 0)
7400 start = 0;
7401 if (end < 0)
7402 end = 0;
7403 if (end > self->length)
7404 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007405 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 /* full slice, return original string */
7407 Py_INCREF(self);
7408 return (PyObject*) self;
7409 }
7410 if (start > end)
7411 start = end;
7412 /* copy slice */
7413 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7414 end - start);
7415}
7416
7417PyObject *PyUnicode_Split(PyObject *s,
7418 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007419 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420{
7421 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007422
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 s = PyUnicode_FromObject(s);
7424 if (s == NULL)
7425 return NULL;
7426 if (sep != NULL) {
7427 sep = PyUnicode_FromObject(sep);
7428 if (sep == NULL) {
7429 Py_DECREF(s);
7430 return NULL;
7431 }
7432 }
7433
7434 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7435
7436 Py_DECREF(s);
7437 Py_XDECREF(sep);
7438 return result;
7439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442"S.split([sep [,maxsplit]]) -> list of strings\n\
7443\n\
7444Return a list of the words in S, using sep as the\n\
7445delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007446splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007447any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449static PyObject*
7450unicode_split(PyUnicodeObject *self, PyObject *args)
7451{
7452 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007453 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 return NULL;
7457
7458 if (substring == Py_None)
7459 return split(self, NULL, maxcount);
7460 else if (PyUnicode_Check(substring))
7461 return split(self, (PyUnicodeObject *)substring, maxcount);
7462 else
7463 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7464}
7465
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007466PyObject *
7467PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7468{
7469 PyObject* str_obj;
7470 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007471 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007472
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007473 str_obj = PyUnicode_FromObject(str_in);
7474 if (!str_obj)
7475 return NULL;
7476 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007477 if (!sep_obj) {
7478 Py_DECREF(str_obj);
7479 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007480 }
7481
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007482 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007483 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7484 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7485 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007486
Fredrik Lundhb9479482006-05-26 17:22:38 +00007487 Py_DECREF(sep_obj);
7488 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007489
7490 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007491}
7492
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007493
7494PyObject *
7495PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7496{
7497 PyObject* str_obj;
7498 PyObject* sep_obj;
7499 PyObject* out;
7500
7501 str_obj = PyUnicode_FromObject(str_in);
7502 if (!str_obj)
7503 return NULL;
7504 sep_obj = PyUnicode_FromObject(sep_in);
7505 if (!sep_obj) {
7506 Py_DECREF(str_obj);
7507 return NULL;
7508 }
7509
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007510 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007511 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7512 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7513 );
7514
7515 Py_DECREF(sep_obj);
7516 Py_DECREF(str_obj);
7517
7518 return out;
7519}
7520
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007521PyDoc_STRVAR(partition__doc__,
7522"S.partition(sep) -> (head, sep, tail)\n\
7523\n\
7524Searches for the separator sep in S, and returns the part before it,\n\
7525the separator itself, and the part after it. If the separator is not\n\
7526found, returns S and two empty strings.");
7527
7528static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007529unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007530{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007531 return PyUnicode_Partition((PyObject *)self, separator);
7532}
7533
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007534PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007535"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007536\n\
7537Searches for the separator sep in S, starting at the end of S, and returns\n\
7538the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007539separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007540
7541static PyObject*
7542unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7543{
7544 return PyUnicode_RPartition((PyObject *)self, separator);
7545}
7546
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007547PyObject *PyUnicode_RSplit(PyObject *s,
7548 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007550{
7551 PyObject *result;
7552
7553 s = PyUnicode_FromObject(s);
7554 if (s == NULL)
7555 return NULL;
7556 if (sep != NULL) {
7557 sep = PyUnicode_FromObject(sep);
7558 if (sep == NULL) {
7559 Py_DECREF(s);
7560 return NULL;
7561 }
7562 }
7563
7564 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7565
7566 Py_DECREF(s);
7567 Py_XDECREF(sep);
7568 return result;
7569}
7570
7571PyDoc_STRVAR(rsplit__doc__,
7572"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7573\n\
7574Return a list of the words in S, using sep as the\n\
7575delimiter string, starting at the end of the string and\n\
7576working to the front. If maxsplit is given, at most maxsplit\n\
7577splits are done. If sep is not specified, any whitespace string\n\
7578is a separator.");
7579
7580static PyObject*
7581unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7582{
7583 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007585
Martin v. Löwis18e16552006-02-15 17:27:45 +00007586 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007587 return NULL;
7588
7589 if (substring == Py_None)
7590 return rsplit(self, NULL, maxcount);
7591 else if (PyUnicode_Check(substring))
7592 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7593 else
7594 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7595}
7596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007598"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599\n\
7600Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007601Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
7604static PyObject*
7605unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7606{
Guido van Rossum86662912000-04-11 15:38:46 +00007607 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
Guido van Rossum86662912000-04-11 15:38:46 +00007609 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 return NULL;
7611
Guido van Rossum86662912000-04-11 15:38:46 +00007612 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
7615static
7616PyObject *unicode_str(PyUnicodeObject *self)
7617{
Fred Drakee4315f52000-05-09 19:53:39 +00007618 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619}
7620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622"S.swapcase() -> unicode\n\
7623\n\
7624Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007628unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 return fixup(self, fixswapcase);
7631}
7632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634"S.translate(table) -> unicode\n\
7635\n\
7636Return a copy of the string S, where all characters have been mapped\n\
7637through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007638Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7639Unmapped characters are left untouched. Characters mapped to None\n\
7640are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007643unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644{
Tim Petersced69f82003-09-16 20:30:58 +00007645 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007647 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 "ignore");
7649}
7650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652"S.upper() -> unicode\n\
7653\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007657unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659 return fixup(self, fixupper);
7660}
7661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007662PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663"S.zfill(width) -> unicode\n\
7664\n\
7665Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007666of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668static PyObject *
7669unicode_zfill(PyUnicodeObject *self, PyObject *args)
7670{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 PyUnicodeObject *u;
7673
Martin v. Löwis18e16552006-02-15 17:27:45 +00007674 Py_ssize_t width;
7675 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 return NULL;
7677
7678 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007679 if (PyUnicode_CheckExact(self)) {
7680 Py_INCREF(self);
7681 return (PyObject*) self;
7682 }
7683 else
7684 return PyUnicode_FromUnicode(
7685 PyUnicode_AS_UNICODE(self),
7686 PyUnicode_GET_SIZE(self)
7687 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 }
7689
7690 fill = width - self->length;
7691
7692 u = pad(self, fill, 0, '0');
7693
Walter Dörwald068325e2002-04-15 13:36:47 +00007694 if (u == NULL)
7695 return NULL;
7696
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 if (u->str[fill] == '+' || u->str[fill] == '-') {
7698 /* move sign to beginning of string */
7699 u->str[0] = u->str[fill];
7700 u->str[fill] = '0';
7701 }
7702
7703 return (PyObject*) u;
7704}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
#if 0
/* Disabled debugging helper: report the file-level numfree counter as a
   Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007715"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007717Return True if S starts with the specified prefix, False otherwise.\n\
7718With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007719With optional end, stop comparing S at that position.\n\
7720prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
7722static PyObject *
7723unicode_startswith(PyUnicodeObject *self,
7724 PyObject *args)
7725{
Georg Brandl24250812006-06-09 18:45:48 +00007726 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007728 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007729 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007730 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
Georg Brandl24250812006-06-09 18:45:48 +00007732 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007733 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007735 if (PyTuple_Check(subobj)) {
7736 Py_ssize_t i;
7737 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7738 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7739 PyTuple_GET_ITEM(subobj, i));
7740 if (substring == NULL)
7741 return NULL;
7742 result = tailmatch(self, substring, start, end, -1);
7743 Py_DECREF(substring);
7744 if (result) {
7745 Py_RETURN_TRUE;
7746 }
7747 }
7748 /* nothing matched */
7749 Py_RETURN_FALSE;
7750 }
7751 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007753 return NULL;
7754 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007756 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757}
7758
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007761"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007763Return True if S ends with the specified suffix, False otherwise.\n\
7764With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007765With optional end, stop comparing S at that position.\n\
7766suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
7768static PyObject *
7769unicode_endswith(PyUnicodeObject *self,
7770 PyObject *args)
7771{
Georg Brandl24250812006-06-09 18:45:48 +00007772 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007774 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007775 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007776 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
Georg Brandl24250812006-06-09 18:45:48 +00007778 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7779 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007781 if (PyTuple_Check(subobj)) {
7782 Py_ssize_t i;
7783 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7784 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7785 PyTuple_GET_ITEM(subobj, i));
7786 if (substring == NULL)
7787 return NULL;
7788 result = tailmatch(self, substring, start, end, +1);
7789 Py_DECREF(substring);
7790 if (result) {
7791 Py_RETURN_TRUE;
7792 }
7793 }
7794 Py_RETURN_FALSE;
7795 }
7796 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
Georg Brandl24250812006-06-09 18:45:48 +00007800 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007802 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803}
7804
7805
Eric Smitha9f7d622008-02-17 19:46:49 +00007806/* Implements do_string_format, which is unicode because of stringlib */
7807#include "stringlib/string_format.h"
7808
7809PyDoc_STRVAR(format__doc__,
7810"S.format(*args, **kwargs) -> unicode\n\
7811\n\
7812");
7813
7814PyDoc_STRVAR(p_format__doc__,
7815"S.__format__(format_spec) -> unicode\n\
7816\n\
7817");
7818
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007819
7820static PyObject *
7821unicode_getnewargs(PyUnicodeObject *v)
7822{
7823 return Py_BuildValue("(u#)", v->str, v->length);
7824}
7825
7826
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827static PyMethodDef unicode_methods[] = {
7828
7829 /* Order is according to common usage: often used methods should
7830 appear first, since lookup is done sequentially. */
7831
Georg Brandlecdc0a92006-03-30 12:19:07 +00007832 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007833 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7834 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007835 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7837 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7838 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7839 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7840 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7841 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7842 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007843 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007844 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7845 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7846 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007848 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007849/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7850 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7851 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7852 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007854 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007855 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007856 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007857 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7858 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7859 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7860 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7861 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7862 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7863 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7864 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7865 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7866 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7867 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7868 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7869 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7870 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007871 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007872 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7873 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7874 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7875 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007876#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007877 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878#endif
7879
7880#if 0
7881 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007882 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883#endif
7884
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007885 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 {NULL, NULL}
7887};
7888
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007889static PyObject *
7890unicode_mod(PyObject *v, PyObject *w)
7891{
7892 if (!PyUnicode_Check(v)) {
7893 Py_INCREF(Py_NotImplemented);
7894 return Py_NotImplemented;
7895 }
7896 return PyUnicode_Format(v, w);
7897}
7898
/* Number-protocol slot table for unicode objects.  The only slot given a
   function here is nb_remainder (unicode_mod, i.e. the `%` operator);
   the slots listed before it are explicitly zero and the slots after it
   are not mentioned in this initializer. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    0,                          /*nb_divide*/
    unicode_mod,                /*nb_remainder*/
};
7906
/* Sequence-protocol slot table for unicode objects.  Length, concat,
   repeat, item, slice and contains are provided; the two assignment
   slots (sq_ass_item, sq_ass_slice) are zero. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7917
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007918static PyObject*
7919unicode_subscript(PyUnicodeObject* self, PyObject* item)
7920{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007921 if (PyIndex_Check(item)) {
7922 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 if (i == -1 && PyErr_Occurred())
7924 return NULL;
7925 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007926 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007927 return unicode_getitem(self, i);
7928 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007929 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007930 Py_UNICODE* source_buf;
7931 Py_UNICODE* result_buf;
7932 PyObject* result;
7933
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007934 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007935 &start, &stop, &step, &slicelength) < 0) {
7936 return NULL;
7937 }
7938
7939 if (slicelength <= 0) {
7940 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007941 } else if (start == 0 && step == 1 && slicelength == self->length &&
7942 PyUnicode_CheckExact(self)) {
7943 Py_INCREF(self);
7944 return (PyObject *)self;
7945 } else if (step == 1) {
7946 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007947 } else {
7948 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007949 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7950 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007951
7952 if (result_buf == NULL)
7953 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007954
7955 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7956 result_buf[i] = source_buf[cur];
7957 }
Tim Petersced69f82003-09-16 20:30:58 +00007958
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007959 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007960 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007961 return result;
7962 }
7963 } else {
7964 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7965 return NULL;
7966 }
7967}
7968
/* Mapping-protocol slot table for unicode objects: length and subscript
   (unicode_subscript handles both integer and slice subscripts) are
   provided; subscript assignment is zero. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
7974
Martin v. Löwis18e16552006-02-15 17:27:45 +00007975static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007977 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 const void **ptr)
7979{
7980 if (index != 0) {
7981 PyErr_SetString(PyExc_SystemError,
7982 "accessing non-existent unicode segment");
7983 return -1;
7984 }
7985 *ptr = (void *) self->str;
7986 return PyUnicode_GET_DATA_SIZE(self);
7987}
7988
Martin v. Löwis18e16552006-02-15 17:27:45 +00007989static Py_ssize_t
7990unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 const void **ptr)
7992{
7993 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007994 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 return -1;
7996}
7997
7998static int
7999unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008000 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001{
8002 if (lenp)
8003 *lenp = PyUnicode_GET_DATA_SIZE(self);
8004 return 1;
8005}
8006
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008007static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008009 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 const void **ptr)
8011{
8012 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 if (index != 0) {
8015 PyErr_SetString(PyExc_SystemError,
8016 "accessing non-existent unicode segment");
8017 return -1;
8018 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008019 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 if (str == NULL)
8021 return -1;
8022 *ptr = (void *) PyString_AS_STRING(str);
8023 return PyString_GET_SIZE(str);
8024}
8025
8026/* Helpers for PyUnicode_Format() */
8027
8028static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008031 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 if (argidx < arglen) {
8033 (*p_argidx)++;
8034 if (arglen < 0)
8035 return args;
8036 else
8037 return PyTuple_GetItem(args, argidx);
8038 }
8039 PyErr_SetString(PyExc_TypeError,
8040 "not enough arguments for format string");
8041 return NULL;
8042}
8043
/* Bit flags collected in `flags` while PyUnicode_Format() parses one
   conversion specifier.  Each bit is set by the flag character noted. */
#define F_LJUST (1<<0)  /* '-'  (also set when a '*' width is negative) */
#define F_SIGN (1<<1)   /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT (1<<3)    /* '#' */
#define F_ZERO (1<<4)   /* '0' */
8049
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008051strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 register Py_ssize_t i;
8054 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 for (i = len - 1; i >= 0; i--)
8056 buffer[i] = (Py_UNICODE) charbuffer[i];
8057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 return len;
8059}
8060
Neal Norwitzfc76d632006-01-10 06:03:13 +00008061static int
8062doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8063{
Tim Peters15231542006-02-16 01:08:01 +00008064 Py_ssize_t result;
8065
Neal Norwitzfc76d632006-01-10 06:03:13 +00008066 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008067 result = strtounicode(buffer, (char *)buffer);
8068 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008069}
8070
8071static int
8072longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8073{
Tim Peters15231542006-02-16 01:08:01 +00008074 Py_ssize_t result;
8075
Neal Norwitzfc76d632006-01-10 06:03:13 +00008076 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008077 result = strtounicode(buffer, (char *)buffer);
8078 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008079}
8080
Guido van Rossum078151d2002-08-11 04:24:12 +00008081/* XXX To save some code duplication, formatfloat/long/int could have been
8082 shared with stringobject.c, converting from 8-bit to Unicode after the
8083 formatting is done. */
8084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085static int
8086formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008087 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 int flags,
8089 int prec,
8090 int type,
8091 PyObject *v)
8092{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008093 /* fmt = '%#.' + `prec` + `type`
8094 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 char fmt[20];
8096 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008097
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 x = PyFloat_AsDouble(v);
8099 if (x == -1.0 && PyErr_Occurred())
8100 return -1;
8101 if (prec < 0)
8102 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8104 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008105 /* Worst case length calc to ensure no buffer overrun:
8106
8107 'g' formats:
8108 fmt = %#.<prec>g
8109 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8110 for any double rep.)
8111 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8112
8113 'f' formats:
8114 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8115 len = 1 + 50 + 1 + prec = 52 + prec
8116
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008117 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008118 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008119
8120 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008121 if (((type == 'g' || type == 'G') &&
8122 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008123 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008124 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008125 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008126 return -1;
8127 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008128 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8129 (flags&F_ALT) ? "#" : "",
8130 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008131 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132}
8133
Tim Peters38fd5b62000-09-21 05:43:11 +00008134static PyObject*
8135formatlong(PyObject *val, int flags, int prec, int type)
8136{
8137 char *buf;
8138 int i, len;
8139 PyObject *str; /* temporary string object. */
8140 PyUnicodeObject *result;
8141
8142 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8143 if (!str)
8144 return NULL;
8145 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008146 if (!result) {
8147 Py_DECREF(str);
8148 return NULL;
8149 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008150 for (i = 0; i < len; i++)
8151 result->str[i] = buf[i];
8152 result->str[len] = 0;
8153 Py_DECREF(str);
8154 return (PyObject*)result;
8155}
8156
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157static int
8158formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008159 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 int flags,
8161 int prec,
8162 int type,
8163 PyObject *v)
8164{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008165 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008166 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8167 * + 1 + 1
8168 * = 24
8169 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008170 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008171 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 long x;
8173
8174 x = PyInt_AsLong(v);
8175 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008176 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008177 if (x < 0 && type == 'u') {
8178 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008179 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008180 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8181 sign = "-";
8182 else
8183 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008185 prec = 1;
8186
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008187 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8188 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008189 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008190 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008191 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008192 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008193 return -1;
8194 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008195
8196 if ((flags & F_ALT) &&
8197 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008198 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008199 * of issues that cause pain:
8200 * - when 0 is being converted, the C standard leaves off
8201 * the '0x' or '0X', which is inconsistent with other
8202 * %#x/%#X conversions and inconsistent with Python's
8203 * hex() function
8204 * - there are platforms that violate the standard and
8205 * convert 0 with the '0x' or '0X'
8206 * (Metrowerks, Compaq Tru64)
8207 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008208 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008209 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008210 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008211 * We can achieve the desired consistency by inserting our
8212 * own '0x' or '0X' prefix, and substituting %x/%X in place
8213 * of %#x/%#X.
8214 *
8215 * Note that this is the same approach as used in
8216 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008217 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008218 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8219 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008220 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008221 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008222 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8223 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008224 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008225 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008226 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008227 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008228 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008229 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
8232static int
8233formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008234 size_t buflen,
8235 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008237 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008238 if (PyUnicode_Check(v)) {
8239 if (PyUnicode_GET_SIZE(v) != 1)
8240 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008244 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008245 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008246 goto onError;
8247 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
8250 else {
8251 /* Integer input truncated to a character */
8252 long x;
8253 x = PyInt_AsLong(v);
8254 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008255 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008256#ifdef Py_UNICODE_WIDE
8257 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008258 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008259 "%c arg not in range(0x110000) "
8260 "(wide Python build)");
8261 return -1;
8262 }
8263#else
8264 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008265 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008266 "%c arg not in range(0x10000) "
8267 "(narrow Python build)");
8268 return -1;
8269 }
8270#endif
8271 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
8273 buf[1] = '\0';
8274 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008275
8276 onError:
8277 PyErr_SetString(PyExc_TypeError,
8278 "%c requires int or char");
8279 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280}
8281
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints, and chars are formatted.  XXX This is a
   magic number.  Each formatting routine does bounds checking to ensure
   no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
8291
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292PyObject *PyUnicode_Format(PyObject *format,
8293 PyObject *args)
8294{
8295 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 int args_owned = 0;
8298 PyUnicodeObject *result = NULL;
8299 PyObject *dict = NULL;
8300 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 if (format == NULL || args == NULL) {
8303 PyErr_BadInternalCall();
8304 return NULL;
8305 }
8306 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008307 if (uformat == NULL)
8308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 fmt = PyUnicode_AS_UNICODE(uformat);
8310 fmtcnt = PyUnicode_GET_SIZE(uformat);
8311
8312 reslen = rescnt = fmtcnt + 100;
8313 result = _PyUnicode_New(reslen);
8314 if (result == NULL)
8315 goto onError;
8316 res = PyUnicode_AS_UNICODE(result);
8317
8318 if (PyTuple_Check(args)) {
8319 arglen = PyTuple_Size(args);
8320 argidx = 0;
8321 }
8322 else {
8323 arglen = -1;
8324 argidx = -2;
8325 }
Christian Heimese93237d2007-12-19 02:37:44 +00008326 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008327 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 dict = args;
8329
8330 while (--fmtcnt >= 0) {
8331 if (*fmt != '%') {
8332 if (--rescnt < 0) {
8333 rescnt = fmtcnt + 100;
8334 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008335 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008336 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8338 --rescnt;
8339 }
8340 *res++ = *fmt++;
8341 }
8342 else {
8343 /* Got a format specifier */
8344 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 Py_UNICODE c = '\0';
8348 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008349 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 PyObject *v = NULL;
8351 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008352 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008354 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008355 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356
8357 fmt++;
8358 if (*fmt == '(') {
8359 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008360 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 PyObject *key;
8362 int pcount = 1;
8363
8364 if (dict == NULL) {
8365 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008366 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 goto onError;
8368 }
8369 ++fmt;
8370 --fmtcnt;
8371 keystart = fmt;
8372 /* Skip over balanced parentheses */
8373 while (pcount > 0 && --fmtcnt >= 0) {
8374 if (*fmt == ')')
8375 --pcount;
8376 else if (*fmt == '(')
8377 ++pcount;
8378 fmt++;
8379 }
8380 keylen = fmt - keystart - 1;
8381 if (fmtcnt < 0 || pcount > 0) {
8382 PyErr_SetString(PyExc_ValueError,
8383 "incomplete format key");
8384 goto onError;
8385 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008386#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008387 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 then looked up since Python uses strings to hold
8389 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008390 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 key = PyUnicode_EncodeUTF8(keystart,
8392 keylen,
8393 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008394#else
8395 key = PyUnicode_FromUnicode(keystart, keylen);
8396#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 if (key == NULL)
8398 goto onError;
8399 if (args_owned) {
8400 Py_DECREF(args);
8401 args_owned = 0;
8402 }
8403 args = PyObject_GetItem(dict, key);
8404 Py_DECREF(key);
8405 if (args == NULL) {
8406 goto onError;
8407 }
8408 args_owned = 1;
8409 arglen = -1;
8410 argidx = -2;
8411 }
8412 while (--fmtcnt >= 0) {
8413 switch (c = *fmt++) {
8414 case '-': flags |= F_LJUST; continue;
8415 case '+': flags |= F_SIGN; continue;
8416 case ' ': flags |= F_BLANK; continue;
8417 case '#': flags |= F_ALT; continue;
8418 case '0': flags |= F_ZERO; continue;
8419 }
8420 break;
8421 }
8422 if (c == '*') {
8423 v = getnextarg(args, arglen, &argidx);
8424 if (v == NULL)
8425 goto onError;
8426 if (!PyInt_Check(v)) {
8427 PyErr_SetString(PyExc_TypeError,
8428 "* wants int");
8429 goto onError;
8430 }
8431 width = PyInt_AsLong(v);
8432 if (width < 0) {
8433 flags |= F_LJUST;
8434 width = -width;
8435 }
8436 if (--fmtcnt >= 0)
8437 c = *fmt++;
8438 }
8439 else if (c >= '0' && c <= '9') {
8440 width = c - '0';
8441 while (--fmtcnt >= 0) {
8442 c = *fmt++;
8443 if (c < '0' || c > '9')
8444 break;
8445 if ((width*10) / 10 != width) {
8446 PyErr_SetString(PyExc_ValueError,
8447 "width too big");
8448 goto onError;
8449 }
8450 width = width*10 + (c - '0');
8451 }
8452 }
8453 if (c == '.') {
8454 prec = 0;
8455 if (--fmtcnt >= 0)
8456 c = *fmt++;
8457 if (c == '*') {
8458 v = getnextarg(args, arglen, &argidx);
8459 if (v == NULL)
8460 goto onError;
8461 if (!PyInt_Check(v)) {
8462 PyErr_SetString(PyExc_TypeError,
8463 "* wants int");
8464 goto onError;
8465 }
8466 prec = PyInt_AsLong(v);
8467 if (prec < 0)
8468 prec = 0;
8469 if (--fmtcnt >= 0)
8470 c = *fmt++;
8471 }
8472 else if (c >= '0' && c <= '9') {
8473 prec = c - '0';
8474 while (--fmtcnt >= 0) {
8475 c = Py_CHARMASK(*fmt++);
8476 if (c < '0' || c > '9')
8477 break;
8478 if ((prec*10) / 10 != prec) {
8479 PyErr_SetString(PyExc_ValueError,
8480 "prec too big");
8481 goto onError;
8482 }
8483 prec = prec*10 + (c - '0');
8484 }
8485 }
8486 } /* prec */
8487 if (fmtcnt >= 0) {
8488 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 if (--fmtcnt >= 0)
8490 c = *fmt++;
8491 }
8492 }
8493 if (fmtcnt < 0) {
8494 PyErr_SetString(PyExc_ValueError,
8495 "incomplete format");
8496 goto onError;
8497 }
8498 if (c != '%') {
8499 v = getnextarg(args, arglen, &argidx);
8500 if (v == NULL)
8501 goto onError;
8502 }
8503 sign = 0;
8504 fill = ' ';
8505 switch (c) {
8506
8507 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008508 pbuf = formatbuf;
8509 /* presume that buffer length is at least 1 */
8510 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 len = 1;
8512 break;
8513
8514 case 's':
8515 case 'r':
8516 if (PyUnicode_Check(v) && c == 's') {
8517 temp = v;
8518 Py_INCREF(temp);
8519 }
8520 else {
8521 PyObject *unicode;
8522 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008523 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 else
8525 temp = PyObject_Repr(v);
8526 if (temp == NULL)
8527 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008528 if (PyUnicode_Check(temp))
8529 /* nothing to do */;
8530 else if (PyString_Check(temp)) {
8531 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008532 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008534 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008536 Py_DECREF(temp);
8537 temp = unicode;
8538 if (temp == NULL)
8539 goto onError;
8540 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008541 else {
8542 Py_DECREF(temp);
8543 PyErr_SetString(PyExc_TypeError,
8544 "%s argument has non-string str()");
8545 goto onError;
8546 }
8547 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008548 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 len = PyUnicode_GET_SIZE(temp);
8550 if (prec >= 0 && len > prec)
8551 len = prec;
8552 break;
8553
8554 case 'i':
8555 case 'd':
8556 case 'u':
8557 case 'o':
8558 case 'x':
8559 case 'X':
8560 if (c == 'i')
8561 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008562 isnumok = 0;
8563 if (PyNumber_Check(v)) {
8564 PyObject *iobj=NULL;
8565
8566 if (PyInt_Check(v) || (PyLong_Check(v))) {
8567 iobj = v;
8568 Py_INCREF(iobj);
8569 }
8570 else {
8571 iobj = PyNumber_Int(v);
8572 if (iobj==NULL) iobj = PyNumber_Long(v);
8573 }
8574 if (iobj!=NULL) {
8575 if (PyInt_Check(iobj)) {
8576 isnumok = 1;
8577 pbuf = formatbuf;
8578 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8579 flags, prec, c, iobj);
8580 Py_DECREF(iobj);
8581 if (len < 0)
8582 goto onError;
8583 sign = 1;
8584 }
8585 else if (PyLong_Check(iobj)) {
8586 isnumok = 1;
8587 temp = formatlong(iobj, flags, prec, c);
8588 Py_DECREF(iobj);
8589 if (!temp)
8590 goto onError;
8591 pbuf = PyUnicode_AS_UNICODE(temp);
8592 len = PyUnicode_GET_SIZE(temp);
8593 sign = 1;
8594 }
8595 else {
8596 Py_DECREF(iobj);
8597 }
8598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008600 if (!isnumok) {
8601 PyErr_Format(PyExc_TypeError,
8602 "%%%c format: a number is required, "
8603 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008604 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008605 }
8606 if (flags & F_ZERO)
8607 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 break;
8609
8610 case 'e':
8611 case 'E':
8612 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008613 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 case 'g':
8615 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008616 if (c == 'F')
8617 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008618 pbuf = formatbuf;
8619 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8620 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 if (len < 0)
8622 goto onError;
8623 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008624 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 fill = '0';
8626 break;
8627
8628 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008629 pbuf = formatbuf;
8630 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 if (len < 0)
8632 goto onError;
8633 break;
8634
8635 default:
8636 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008637 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008638 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008639 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008640 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008641 (Py_ssize_t)(fmt - 1 -
8642 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 goto onError;
8644 }
8645 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008646 if (*pbuf == '-' || *pbuf == '+') {
8647 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 len--;
8649 }
8650 else if (flags & F_SIGN)
8651 sign = '+';
8652 else if (flags & F_BLANK)
8653 sign = ' ';
8654 else
8655 sign = 0;
8656 }
8657 if (width < len)
8658 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008659 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 reslen -= rescnt;
8661 rescnt = width + fmtcnt + 100;
8662 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008663 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008664 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008665 PyErr_NoMemory();
8666 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008667 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008668 if (_PyUnicode_Resize(&result, reslen) < 0) {
8669 Py_XDECREF(temp);
8670 goto onError;
8671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 res = PyUnicode_AS_UNICODE(result)
8673 + reslen - rescnt;
8674 }
8675 if (sign) {
8676 if (fill != ' ')
8677 *res++ = sign;
8678 rescnt--;
8679 if (width > len)
8680 width--;
8681 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008682 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8683 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008684 assert(pbuf[1] == c);
8685 if (fill != ' ') {
8686 *res++ = *pbuf++;
8687 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008688 }
Tim Petersfff53252001-04-12 18:38:48 +00008689 rescnt -= 2;
8690 width -= 2;
8691 if (width < 0)
8692 width = 0;
8693 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 if (width > len && !(flags & F_LJUST)) {
8696 do {
8697 --rescnt;
8698 *res++ = fill;
8699 } while (--width > len);
8700 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 if (fill == ' ') {
8702 if (sign)
8703 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008704 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008705 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008706 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008707 *res++ = *pbuf++;
8708 *res++ = *pbuf++;
8709 }
8710 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008711 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 res += len;
8713 rescnt -= len;
8714 while (--width >= len) {
8715 --rescnt;
8716 *res++ = ' ';
8717 }
8718 if (dict && (argidx < arglen) && c != '%') {
8719 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008720 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008721 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 goto onError;
8723 }
8724 Py_XDECREF(temp);
8725 } /* '%' */
8726 } /* until end */
8727 if (argidx < arglen && !dict) {
8728 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008729 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 goto onError;
8731 }
8732
Thomas Woutersa96affe2006-03-12 00:29:36 +00008733 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8734 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 if (args_owned) {
8736 Py_DECREF(args);
8737 }
8738 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 return (PyObject *)result;
8740
8741 onError:
8742 Py_XDECREF(result);
8743 Py_DECREF(uformat);
8744 if (args_owned) {
8745 Py_DECREF(args);
8746 }
8747 return NULL;
8748}
8749
8750static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008751 (readbufferproc) unicode_buffer_getreadbuf,
8752 (writebufferproc) unicode_buffer_getwritebuf,
8753 (segcountproc) unicode_buffer_getsegcount,
8754 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755};
8756
Jeremy Hylton938ace62002-07-17 16:30:39 +00008757static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008758unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8759
Tim Peters6d6c1a32001-08-02 04:15:00 +00008760static PyObject *
8761unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8762{
8763 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008764 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008765 char *encoding = NULL;
8766 char *errors = NULL;
8767
Guido van Rossume023fe02001-08-30 03:12:59 +00008768 if (type != &PyUnicode_Type)
8769 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008770 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8771 kwlist, &x, &encoding, &errors))
8772 return NULL;
8773 if (x == NULL)
8774 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008775 if (encoding == NULL && errors == NULL)
8776 return PyObject_Unicode(x);
8777 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008778 return PyUnicode_FromEncodedObject(x, encoding, errors);
8779}
8780
Guido van Rossume023fe02001-08-30 03:12:59 +00008781static PyObject *
8782unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8783{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008784 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008785 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008786
8787 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8788 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8789 if (tmp == NULL)
8790 return NULL;
8791 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008792 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008793 if (pnew == NULL) {
8794 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008795 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008796 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008797 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008798 if (pnew->str == NULL) {
8799 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008800 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008801 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008802 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008803 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008804 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8805 pnew->length = n;
8806 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008807 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008808 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008809}
8810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008811PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008812"unicode(string [, encoding[, errors]]) -> object\n\
8813\n\
8814Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008815encoding defaults to the current default string encoding.\n\
8816errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008817
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008819 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 "unicode", /* tp_name */
8821 sizeof(PyUnicodeObject), /* tp_size */
8822 0, /* tp_itemsize */
8823 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008824 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008826 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008828 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008829 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008830 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008832 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 (hashfunc) unicode_hash, /* tp_hash*/
8834 0, /* tp_call*/
8835 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008836 PyObject_GenericGetAttr, /* tp_getattro */
8837 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008839 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008840 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008841 unicode_doc, /* tp_doc */
8842 0, /* tp_traverse */
8843 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008844 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008845 0, /* tp_weaklistoffset */
8846 0, /* tp_iter */
8847 0, /* tp_iternext */
8848 unicode_methods, /* tp_methods */
8849 0, /* tp_members */
8850 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008851 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008852 0, /* tp_dict */
8853 0, /* tp_descr_get */
8854 0, /* tp_descr_set */
8855 0, /* tp_dictoffset */
8856 0, /* tp_init */
8857 0, /* tp_alloc */
8858 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008859 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860};
8861
8862/* Initialize the Unicode implementation */
8863
Thomas Wouters78890102000-07-22 19:25:51 +00008864void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008866 int i;
8867
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008868 /* XXX - move this array to unicodectype.c ? */
8869 Py_UNICODE linebreak[] = {
8870 0x000A, /* LINE FEED */
8871 0x000D, /* CARRIAGE RETURN */
8872 0x001C, /* FILE SEPARATOR */
8873 0x001D, /* GROUP SEPARATOR */
8874 0x001E, /* RECORD SEPARATOR */
8875 0x0085, /* NEXT LINE */
8876 0x2028, /* LINE SEPARATOR */
8877 0x2029, /* PARAGRAPH SEPARATOR */
8878 };
8879
Fred Drakee4315f52000-05-09 19:53:39 +00008880 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008881 free_list = NULL;
8882 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008884 if (!unicode_empty)
8885 return;
8886
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008887 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008888 for (i = 0; i < 256; i++)
8889 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008890 if (PyType_Ready(&PyUnicode_Type) < 0)
8891 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008892
8893 /* initialize the linebreak bloom filter */
8894 bloom_linebreak = make_bloom_mask(
8895 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8896 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008897
8898 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899}
8900
8901/* Finalize the Unicode implementation */
8902
Christian Heimes3b718a72008-02-14 12:47:33 +00008903int
8904PyUnicode_ClearFreeList(void)
8905{
8906 int freelist_size = numfree;
8907 PyUnicodeObject *u;
8908
8909 for (u = free_list; u != NULL;) {
8910 PyUnicodeObject *v = u;
8911 u = *(PyUnicodeObject **)u;
8912 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00008913 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00008914 Py_XDECREF(v->defenc);
8915 PyObject_Del(v);
8916 numfree--;
8917 }
8918 free_list = NULL;
8919 assert(numfree == 0);
8920 return freelist_size;
8921}
8922
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923void
Thomas Wouters78890102000-07-22 19:25:51 +00008924_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008926 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008928 Py_XDECREF(unicode_empty);
8929 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008930
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008931 for (i = 0; i < 256; i++) {
8932 if (unicode_latin1[i]) {
8933 Py_DECREF(unicode_latin1[i]);
8934 unicode_latin1[i] = NULL;
8935 }
8936 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008937 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008939
Anthony Baxterac6bd462006-04-13 02:06:09 +00008940#ifdef __cplusplus
8941}
8942#endif
8943
8944
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008945/*
8946Local variables:
8947c-basic-offset: 4
8948indent-tabs-mode: nil
8949End:
8950*/