blob: c008bd6996af1010f14888556ae89271f4cc07a2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Keep-alive threshold for Unicode objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer allocated while it sits on the free list (see
   unicode_dealloc()).  This reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
/* Fast detection of the most frequent whitespace characters.

   Lookup table for the ordinals 0..127: a non-zero entry marks a
   whitespace character.  Eight entries per row; the comment in front
   of a row names the ordinals it covers. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Same for linebreaks: lookup table for the ordinals 0..127 in which a
   non-zero entry marks a line break character.  Eight entries per row;
   the comment in front of a row names the ordinals it covers. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each Unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.

   The shifted constant is 1UL so that the shift is carried out in an
   unsigned type: a plain int 1 shifted left by 31 overflows a 32-bit
   signed int, and the resulting negative value would be sign-extended
   when combined with a wider mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ordinals below 128 are looked up in ascii_linebreak;
   for anything else the bloom mask acts as a cheap pre-filter in front
   of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Set membership test: the bloom mask is checked first and the linear
   search in unicode_member() only runs if the mask bit is set.  The
   expansion is wrapped in parentheses so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000264 unicode->str = PyObject_REALLOC(unicode->str,
265 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000267 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_NoMemory();
269 return -1;
270 }
271 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000272 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 if (unicode->defenc) {
277 Py_DECREF(unicode->defenc);
278 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return 0;
283}
284
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
292
293static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295{
296 register PyUnicodeObject *unicode;
297
Andrew Dalkee0df7622006-05-27 11:04:36 +0000298 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (length == 0 && unicode_empty != NULL) {
300 Py_INCREF(unicode_empty);
301 return unicode_empty;
302 }
303
304 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000305 if (free_list) {
306 unicode = free_list;
307 free_list = *(PyUnicodeObject **)unicode;
308 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 /* Keep-Alive optimization: we only upsize the buffer,
311 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000312 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000313 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000314 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000318 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000319 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
320 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 }
322 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000325 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000326 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (unicode == NULL)
328 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000329 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000333 if (!unicode->str) {
334 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000335 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000337 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000338 * the caller fails before initializing str -- unicode_resize()
339 * reads str[0], and the Keep-Alive optimization can keep memory
340 * allocated for str alive across a call to unicode_dealloc(unicode).
341 * We don't want unicode_resize to read uninitialized memory in
342 * that case.
343 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000344 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000346 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350
351 onError:
352 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000353 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355}
356
357static
Guido van Rossum9475a232001-10-05 20:51:39 +0000358void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000360 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000361 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000362 /* Keep-Alive optimization */
363 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000364 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str = NULL;
366 unicode->length = 0;
367 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000368 if (unicode->defenc) {
369 Py_DECREF(unicode->defenc);
370 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000371 }
372 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000373 *(PyUnicodeObject **)unicode = free_list;
374 free_list = unicode;
375 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 }
377 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000378 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000380 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382}
383
Martin v. Löwis18e16552006-02-15 17:27:45 +0000384int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000385{
386 register PyUnicodeObject *v;
387
388 /* Argument checks */
389 if (unicode == NULL) {
390 PyErr_BadInternalCall();
391 return -1;
392 }
393 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000394 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 PyErr_BadInternalCall();
396 return -1;
397 }
398
399 /* Resizing unicode_empty and single character objects is not
400 possible since these are being shared. We simply return a fresh
401 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000402 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 (v == unicode_empty || v->length == 1)) {
404 PyUnicodeObject *w = _PyUnicode_New(length);
405 if (w == NULL)
406 return -1;
407 Py_UNICODE_COPY(w->str, v->str,
408 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000409 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000410 *unicode = (PyObject *)w;
411 return 0;
412 }
413
414 /* Note that we don't have to modify *unicode for unshared Unicode
415 objects, since we can modify them in-place. */
416 return unicode_resize(v, length);
417}
418
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() with unicodevar cast to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000424 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425{
426 PyUnicodeObject *unicode;
427
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 /* If the Unicode data is known at construction time, we can apply
429 some optimizations which share commonly used objects. */
430 if (u != NULL) {
431
432 /* Optimization for empty strings */
433 if (size == 0 && unicode_empty != NULL) {
434 Py_INCREF(unicode_empty);
435 return (PyObject *)unicode_empty;
436 }
437
438 /* Single character Unicode objects in the Latin-1 range are
439 shared when using this constructor */
440 if (size == 1 && *u < 256) {
441 unicode = unicode_latin1[*u];
442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 if (!unicode)
445 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000446 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 unicode_latin1[*u] = unicode;
448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
Tim Petersced69f82003-09-16 20:30:58 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 unicode = _PyUnicode_New(size);
455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461
462 return (PyObject *)unicode;
463}
464
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000465PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
466{
467 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000468
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000469 if (size < 0) {
470 PyErr_SetString(PyExc_SystemError,
471 "Negative size passed to PyUnicode_FromStringAndSize");
472 return NULL;
473 }
474
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000475 /* If the Unicode data is known at construction time, we can apply
476 some optimizations which share commonly used objects.
477 Also, this means the input must be UTF-8, so fall back to the
478 UTF-8 decoder at the end. */
479 if (u != NULL) {
480
481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
485 }
486
487 /* Single characters are shared when using this constructor.
488 Restrict to ASCII, since the input must be UTF-8. */
489 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000490 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000496 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
501
502 return PyUnicode_DecodeUTF8(u, size, NULL);
503 }
504
505 unicode = _PyUnicode_New(size);
506 if (!unicode)
507 return NULL;
508
509 return (PyObject *)unicode;
510}
511
512PyObject *PyUnicode_FromString(const char *u)
513{
514 size_t size = strlen(u);
515 if (size > PY_SSIZE_T_MAX) {
516 PyErr_SetString(PyExc_OverflowError, "input too long");
517 return NULL;
518 }
519
520 return PyUnicode_FromStringAndSize(u, size);
521}
522
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523#ifdef HAVE_WCHAR_H
524
525PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000526 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527{
528 PyUnicodeObject *unicode;
529
530 if (w == NULL) {
531 PyErr_BadInternalCall();
532 return NULL;
533 }
534
535 unicode = _PyUnicode_New(size);
536 if (!unicode)
537 return NULL;
538
539 /* Copy the wchar_t data into the new object */
540#ifdef HAVE_USABLE_WCHAR_T
541 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000542#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 {
544 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000545 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000547 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 *u++ = *w++;
549 }
550#endif
551
552 return (PyObject *)unicode;
553}
554
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000555static void
556makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
557{
558 *fmt++ = '%';
559 if (width) {
560 if (zeropad)
561 *fmt++ = '0';
562 fmt += sprintf(fmt, "%d", width);
563 }
564 if (precision)
565 fmt += sprintf(fmt, ".%d", precision);
566 if (longflag)
567 *fmt++ = 'l';
568 else if (size_tflag) {
569 char *f = PY_FORMAT_SIZE_T;
570 while (*f)
571 *fmt++ = *f++;
572 }
573 *fmt++ = c;
574 *fmt = '\0';
575}
576
/* Copy the NUL-terminated char string `string` to the output position
   `s`, one output element per char, advancing `s`.  Relies on variables
   named `copy` and `s` being in scope where the macro is used. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
578
579PyObject *
580PyUnicode_FromFormatV(const char *format, va_list vargs)
581{
582 va_list count;
583 Py_ssize_t callcount = 0;
584 PyObject **callresults = NULL;
585 PyObject **callresult = NULL;
586 Py_ssize_t n = 0;
587 int width = 0;
588 int precision = 0;
589 int zeropad;
590 const char* f;
591 Py_UNICODE *s;
592 PyObject *string;
593 /* used by sprintf */
594 char buffer[21];
595 /* use abuffer instead of buffer, if we need more space
596 * (which can happen if there's a format specifier with width). */
597 char *abuffer = NULL;
598 char *realbuffer;
599 Py_ssize_t abuffersize = 0;
600 char fmt[60]; /* should be enough for %0width.precisionld */
601 const char *copy;
602
603#ifdef VA_LIST_IS_ARRAY
604 Py_MEMCPY(count, vargs, sizeof(va_list));
605#else
606#ifdef __va_copy
607 __va_copy(count, vargs);
608#else
609 count = vargs;
610#endif
611#endif
612 /* step 1: count the number of %S/%R format specifications
613 * (we call PyObject_Str()/PyObject_Repr() for these objects
614 * once during step 3 and put the result in an array) */
615 for (f = format; *f; f++) {
616 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
617 ++callcount;
618 }
619 /* step 2: allocate memory for the results of
620 * PyObject_Str()/PyObject_Repr() calls */
621 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000622 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000623 if (!callresults) {
624 PyErr_NoMemory();
625 return NULL;
626 }
627 callresult = callresults;
628 }
629 /* step 3: figure out how large a buffer we need */
630 for (f = format; *f; f++) {
631 if (*f == '%') {
632 const char* p = f;
633 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000634 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000635 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000636 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000637 ;
638
639 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
640 * they don't affect the amount of space we reserve.
641 */
642 if ((*f == 'l' || *f == 'z') &&
643 (f[1] == 'd' || f[1] == 'u'))
644 ++f;
645
646 switch (*f) {
647 case 'c':
648 (void)va_arg(count, int);
649 /* fall through... */
650 case '%':
651 n++;
652 break;
653 case 'd': case 'u': case 'i': case 'x':
654 (void) va_arg(count, int);
655 /* 20 bytes is enough to hold a 64-bit
656 integer. Decimal takes the most space.
657 This isn't enough for octal.
658 If a width is specified we need more
659 (which we allocate later). */
660 if (width < 20)
661 width = 20;
662 n += width;
663 if (abuffersize < width)
664 abuffersize = width;
665 break;
666 case 's':
667 {
668 /* UTF-8 */
669 unsigned char*s;
670 s = va_arg(count, unsigned char*);
671 while (*s) {
672 if (*s < 128) {
673 n++; s++;
674 } else if (*s < 0xc0) {
675 /* invalid UTF-8 */
676 n++; s++;
677 } else if (*s < 0xc0) {
678 n++;
679 s++; if(!*s)break;
680 s++;
681 } else if (*s < 0xe0) {
682 n++;
683 s++; if(!*s)break;
684 s++; if(!*s)break;
685 s++;
686 } else {
687 #ifdef Py_UNICODE_WIDE
688 n++;
689 #else
690 n+=2;
691 #endif
692 s++; if(!*s)break;
693 s++; if(!*s)break;
694 s++; if(!*s)break;
695 s++;
696 }
697 }
698 break;
699 }
700 case 'U':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 assert(obj && PyUnicode_Check(obj));
704 n += PyUnicode_GET_SIZE(obj);
705 break;
706 }
707 case 'V':
708 {
709 PyObject *obj = va_arg(count, PyObject *);
710 const char *str = va_arg(count, const char *);
711 assert(obj || str);
712 assert(!obj || PyUnicode_Check(obj));
713 if (obj)
714 n += PyUnicode_GET_SIZE(obj);
715 else
716 n += strlen(str);
717 break;
718 }
719 case 'S':
720 {
721 PyObject *obj = va_arg(count, PyObject *);
722 PyObject *str;
723 assert(obj);
724 str = PyObject_Str(obj);
725 if (!str)
726 goto fail;
727 n += PyUnicode_GET_SIZE(str);
728 /* Remember the str and switch to the next slot */
729 *callresult++ = str;
730 break;
731 }
732 case 'R':
733 {
734 PyObject *obj = va_arg(count, PyObject *);
735 PyObject *repr;
736 assert(obj);
737 repr = PyObject_Repr(obj);
738 if (!repr)
739 goto fail;
740 n += PyUnicode_GET_SIZE(repr);
741 /* Remember the repr and switch to the next slot */
742 *callresult++ = repr;
743 break;
744 }
745 case 'p':
746 (void) va_arg(count, int);
747 /* maximum 64-bit pointer representation:
748 * 0xffffffffffffffff
749 * so 19 characters is enough.
750 * XXX I count 18 -- what's the extra for?
751 */
752 n += 19;
753 break;
754 default:
755 /* if we stumble upon an unknown
756 formatting code, copy the rest of
757 the format string to the output
758 string. (we cannot just skip the
759 code, since there's no way to know
760 what's in the argument list) */
761 n += strlen(p);
762 goto expand;
763 }
764 } else
765 n++;
766 }
767 expand:
768 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000769 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000770 if (!abuffer) {
771 PyErr_NoMemory();
772 goto fail;
773 }
774 realbuffer = abuffer;
775 }
776 else
777 realbuffer = buffer;
778 /* step 4: fill the buffer */
779 /* Since we've analyzed how much space we need for the worst case,
780 we don't have to resize the string.
781 There can be no errors beyond this point. */
782 string = PyUnicode_FromUnicode(NULL, n);
783 if (!string)
784 goto fail;
785
786 s = PyUnicode_AS_UNICODE(string);
787 callresult = callresults;
788
789 for (f = format; *f; f++) {
790 if (*f == '%') {
791 const char* p = f++;
792 int longflag = 0;
793 int size_tflag = 0;
794 zeropad = (*f == '0');
795 /* parse the width.precision part */
796 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000797 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000798 width = (width*10) + *f++ - '0';
799 precision = 0;
800 if (*f == '.') {
801 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000802 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000803 precision = (precision*10) + *f++ - '0';
804 }
805 /* handle the long flag, but only for %ld and %lu.
806 others can be added when necessary. */
807 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
808 longflag = 1;
809 ++f;
810 }
811 /* handle the size_t flag. */
812 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
813 size_tflag = 1;
814 ++f;
815 }
816
817 switch (*f) {
818 case 'c':
819 *s++ = va_arg(vargs, int);
820 break;
821 case 'd':
822 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
823 if (longflag)
824 sprintf(realbuffer, fmt, va_arg(vargs, long));
825 else if (size_tflag)
826 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
827 else
828 sprintf(realbuffer, fmt, va_arg(vargs, int));
829 appendstring(realbuffer);
830 break;
831 case 'u':
832 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
833 if (longflag)
834 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
835 else if (size_tflag)
836 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
837 else
838 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
839 appendstring(realbuffer);
840 break;
841 case 'i':
842 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
843 sprintf(realbuffer, fmt, va_arg(vargs, int));
844 appendstring(realbuffer);
845 break;
846 case 'x':
847 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
848 sprintf(realbuffer, fmt, va_arg(vargs, int));
849 appendstring(realbuffer);
850 break;
851 case 's':
852 {
853 /* Parameter must be UTF-8 encoded.
854 In case of encoding errors, use
855 the replacement character. */
856 PyObject *u;
857 p = va_arg(vargs, char*);
858 u = PyUnicode_DecodeUTF8(p, strlen(p),
859 "replace");
860 if (!u)
861 goto fail;
862 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
863 PyUnicode_GET_SIZE(u));
864 s += PyUnicode_GET_SIZE(u);
865 Py_DECREF(u);
866 break;
867 }
868 case 'U':
869 {
870 PyObject *obj = va_arg(vargs, PyObject *);
871 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
872 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
873 s += size;
874 break;
875 }
876 case 'V':
877 {
878 PyObject *obj = va_arg(vargs, PyObject *);
879 const char *str = va_arg(vargs, const char *);
880 if (obj) {
881 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
882 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
883 s += size;
884 } else {
885 appendstring(str);
886 }
887 break;
888 }
889 case 'S':
890 case 'R':
891 {
892 Py_UNICODE *ucopy;
893 Py_ssize_t usize;
894 Py_ssize_t upos;
895 /* unused, since we already have the result */
896 (void) va_arg(vargs, PyObject *);
897 ucopy = PyUnicode_AS_UNICODE(*callresult);
898 usize = PyUnicode_GET_SIZE(*callresult);
899 for (upos = 0; upos<usize;)
900 *s++ = ucopy[upos++];
901 /* We're done with the unicode()/repr() => forget it */
902 Py_DECREF(*callresult);
903 /* switch to next unicode()/repr() result */
904 ++callresult;
905 break;
906 }
907 case 'p':
908 sprintf(buffer, "%p", va_arg(vargs, void*));
909 /* %p is ill-defined: ensure leading 0x. */
910 if (buffer[1] == 'X')
911 buffer[1] = 'x';
912 else if (buffer[1] != 'x') {
913 memmove(buffer+2, buffer, strlen(buffer)+1);
914 buffer[0] = '0';
915 buffer[1] = 'x';
916 }
917 appendstring(buffer);
918 break;
919 case '%':
920 *s++ = '%';
921 break;
922 default:
923 appendstring(p);
924 goto end;
925 }
926 } else
927 *s++ = *f;
928 }
929
930 end:
931 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000932 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000933 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000934 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000935 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
936 return string;
937 fail:
938 if (callresults) {
939 PyObject **callresult2 = callresults;
940 while (callresult2 < callresult) {
941 Py_DECREF(*callresult2);
942 ++callresult2;
943 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000944 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000945 }
946 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000947 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000948 return NULL;
949}
950
951#undef appendstring
952
953PyObject *
954PyUnicode_FromFormat(const char *format, ...)
955{
956 PyObject* ret;
957 va_list vargs;
958
959#ifdef HAVE_STDARG_PROTOTYPES
960 va_start(vargs, format);
961#else
962 va_start(vargs);
963#endif
964 ret = PyUnicode_FromFormatV(format, vargs);
965 va_end(vargs);
966 return ret;
967}
968
Martin v. Löwis18e16552006-02-15 17:27:45 +0000969Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
970 wchar_t *w,
971 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972{
973 if (unicode == NULL) {
974 PyErr_BadInternalCall();
975 return -1;
976 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000977
978 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000980 size = PyUnicode_GET_SIZE(unicode) + 1;
981
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982#ifdef HAVE_USABLE_WCHAR_T
983 memcpy(w, unicode->str, size * sizeof(wchar_t));
984#else
985 {
986 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000987 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000988 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000989 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990 *w++ = *u++;
991 }
992#endif
993
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000994 if (size > PyUnicode_GET_SIZE(unicode))
995 return PyUnicode_GET_SIZE(unicode);
996 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997 return size;
998}
999
1000#endif
1001
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001002PyObject *PyUnicode_FromOrdinal(int ordinal)
1003{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001004 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001005
1006#ifdef Py_UNICODE_WIDE
1007 if (ordinal < 0 || ordinal > 0x10ffff) {
1008 PyErr_SetString(PyExc_ValueError,
1009 "unichr() arg not in range(0x110000) "
1010 "(wide Python build)");
1011 return NULL;
1012 }
1013#else
1014 if (ordinal < 0 || ordinal > 0xffff) {
1015 PyErr_SetString(PyExc_ValueError,
1016 "unichr() arg not in range(0x10000) "
1017 "(narrow Python build)");
1018 return NULL;
1019 }
1020#endif
1021
Hye-Shik Chang40574832004-04-06 07:24:51 +00001022 s[0] = (Py_UNICODE)ordinal;
1023 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001024}
1025
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026PyObject *PyUnicode_FromObject(register PyObject *obj)
1027{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 /* XXX Perhaps we should make this API an alias of
1029 PyObject_Unicode() instead ?! */
1030 if (PyUnicode_CheckExact(obj)) {
1031 Py_INCREF(obj);
1032 return obj;
1033 }
1034 if (PyUnicode_Check(obj)) {
1035 /* For a Unicode subtype that's not a Unicode object,
1036 return a true Unicode object with the same data. */
1037 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1038 PyUnicode_GET_SIZE(obj));
1039 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001040 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1041}
1042
1043PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1044 const char *encoding,
1045 const char *errors)
1046{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001047 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001048 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001049 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 if (obj == NULL) {
1052 PyErr_BadInternalCall();
1053 return NULL;
1054 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001055
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001056#if 0
1057 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001058 that no encodings is given and then redirect to
1059 PyObject_Unicode() which then applies the additional logic for
1060 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001061
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001062 NOTE: This API should really only be used for object which
1063 represent *encoded* Unicode !
1064
1065 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001066 if (PyUnicode_Check(obj)) {
1067 if (encoding) {
1068 PyErr_SetString(PyExc_TypeError,
1069 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001070 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001071 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001072 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001073 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001074#else
1075 if (PyUnicode_Check(obj)) {
1076 PyErr_SetString(PyExc_TypeError,
1077 "decoding Unicode is not supported");
1078 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001079 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001080#endif
1081
1082 /* Coerce object */
1083 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001084 s = PyString_AS_STRING(obj);
1085 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001086 }
Christian Heimes3497f942008-05-26 12:29:14 +00001087 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001088 /* Python 2.x specific */
1089 PyErr_Format(PyExc_TypeError,
1090 "decoding bytearray is not supported");
1091 return NULL;
1092 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1094 /* Overwrite the error message with something more useful in
1095 case of a TypeError. */
1096 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001097 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001098 "coercing to Unicode: need string or buffer, "
1099 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001100 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001101 goto onError;
1102 }
Tim Petersced69f82003-09-16 20:30:58 +00001103
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 if (len == 0) {
1106 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001107 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
Tim Petersced69f82003-09-16 20:30:58 +00001109 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001111
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001112 return v;
1113
1114 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116}
1117
1118PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001119 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 const char *encoding,
1121 const char *errors)
1122{
1123 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001124
1125 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001126 encoding = PyUnicode_GetDefaultEncoding();
1127
1128 /* Shortcuts for common default encodings */
1129 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001131 else if (strcmp(encoding, "latin-1") == 0)
1132 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001133#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1134 else if (strcmp(encoding, "mbcs") == 0)
1135 return PyUnicode_DecodeMBCS(s, size, errors);
1136#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001137 else if (strcmp(encoding, "ascii") == 0)
1138 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139
1140 /* Decode via the codec registry */
1141 buffer = PyBuffer_FromMemory((void *)s, size);
1142 if (buffer == NULL)
1143 goto onError;
1144 unicode = PyCodec_Decode(buffer, encoding, errors);
1145 if (unicode == NULL)
1146 goto onError;
1147 if (!PyUnicode_Check(unicode)) {
1148 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001149 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001150 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 Py_DECREF(unicode);
1152 goto onError;
1153 }
1154 Py_DECREF(buffer);
1155 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001156
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 onError:
1158 Py_XDECREF(buffer);
1159 return NULL;
1160}
1161
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001162PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1163 const char *encoding,
1164 const char *errors)
1165{
1166 PyObject *v;
1167
1168 if (!PyUnicode_Check(unicode)) {
1169 PyErr_BadArgument();
1170 goto onError;
1171 }
1172
1173 if (encoding == NULL)
1174 encoding = PyUnicode_GetDefaultEncoding();
1175
1176 /* Decode via the codec registry */
1177 v = PyCodec_Decode(unicode, encoding, errors);
1178 if (v == NULL)
1179 goto onError;
1180 return v;
1181
1182 onError:
1183 return NULL;
1184}
1185
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001187 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 const char *encoding,
1189 const char *errors)
1190{
1191 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 unicode = PyUnicode_FromUnicode(s, size);
1194 if (unicode == NULL)
1195 return NULL;
1196 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1197 Py_DECREF(unicode);
1198 return v;
1199}
1200
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001201PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1202 const char *encoding,
1203 const char *errors)
1204{
1205 PyObject *v;
1206
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_BadArgument();
1209 goto onError;
1210 }
1211
1212 if (encoding == NULL)
1213 encoding = PyUnicode_GetDefaultEncoding();
1214
1215 /* Encode via the codec registry */
1216 v = PyCodec_Encode(unicode, encoding, errors);
1217 if (v == NULL)
1218 goto onError;
1219 return v;
1220
1221 onError:
1222 return NULL;
1223}
1224
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1226 const char *encoding,
1227 const char *errors)
1228{
1229 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001230
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 if (!PyUnicode_Check(unicode)) {
1232 PyErr_BadArgument();
1233 goto onError;
1234 }
Fred Drakee4315f52000-05-09 19:53:39 +00001235
Tim Petersced69f82003-09-16 20:30:58 +00001236 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001237 encoding = PyUnicode_GetDefaultEncoding();
1238
1239 /* Shortcuts for common default encodings */
1240 if (errors == NULL) {
1241 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001242 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001243 else if (strcmp(encoding, "latin-1") == 0)
1244 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001245#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1246 else if (strcmp(encoding, "mbcs") == 0)
1247 return PyUnicode_AsMBCSString(unicode);
1248#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001249 else if (strcmp(encoding, "ascii") == 0)
1250 return PyUnicode_AsASCIIString(unicode);
1251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252
1253 /* Encode via the codec registry */
1254 v = PyCodec_Encode(unicode, encoding, errors);
1255 if (v == NULL)
1256 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257 if (!PyString_Check(v)) {
1258 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001259 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001260 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 Py_DECREF(v);
1262 goto onError;
1263 }
1264 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 onError:
1267 return NULL;
1268}
1269
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001270PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1271 const char *errors)
1272{
1273 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1274
1275 if (v)
1276 return v;
1277 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1278 if (v && errors == NULL)
1279 ((PyUnicodeObject *)unicode)->defenc = v;
1280 return v;
1281}
1282
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1284{
1285 if (!PyUnicode_Check(unicode)) {
1286 PyErr_BadArgument();
1287 goto onError;
1288 }
1289 return PyUnicode_AS_UNICODE(unicode);
1290
1291 onError:
1292 return NULL;
1293}
1294
Martin v. Löwis18e16552006-02-15 17:27:45 +00001295Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296{
1297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 goto onError;
1300 }
1301 return PyUnicode_GET_SIZE(unicode);
1302
1303 onError:
1304 return -1;
1305}
1306
Thomas Wouters78890102000-07-22 19:25:51 +00001307const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001308{
1309 return unicode_default_encoding;
1310}
1311
1312int PyUnicode_SetDefaultEncoding(const char *encoding)
1313{
1314 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Fred Drakee4315f52000-05-09 19:53:39 +00001316 /* Make sure the encoding is valid. As side effect, this also
1317 loads the encoding into the codec registry cache. */
1318 v = _PyCodec_Lookup(encoding);
1319 if (v == NULL)
1320 goto onError;
1321 Py_DECREF(v);
1322 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001323 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001324 sizeof(unicode_default_encoding));
1325 return 0;
1326
1327 onError:
1328 return -1;
1329}
1330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331/* error handling callback helper:
1332 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001333 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 and adjust various state variables.
1335 return 0 on success, -1 on error
1336*/
1337
1338static
1339int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1340 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001341 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1342 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001343 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001345 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001346
1347 PyObject *restuple = NULL;
1348 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001349 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1350 Py_ssize_t requiredsize;
1351 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001353 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 int res = -1;
1355
1356 if (*errorHandler == NULL) {
1357 *errorHandler = PyCodec_LookupError(errors);
1358 if (*errorHandler == NULL)
1359 goto onError;
1360 }
1361
1362 if (*exceptionObject == NULL) {
1363 *exceptionObject = PyUnicodeDecodeError_Create(
1364 encoding, input, insize, *startinpos, *endinpos, reason);
1365 if (*exceptionObject == NULL)
1366 goto onError;
1367 }
1368 else {
1369 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1370 goto onError;
1371 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1372 goto onError;
1373 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1374 goto onError;
1375 }
1376
1377 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1378 if (restuple == NULL)
1379 goto onError;
1380 if (!PyTuple_Check(restuple)) {
1381 PyErr_Format(PyExc_TypeError, &argparse[4]);
1382 goto onError;
1383 }
1384 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1385 goto onError;
1386 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001387 newpos = insize+newpos;
1388 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001389 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001390 goto onError;
1391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392
1393 /* need more space? (at least enough for what we
1394 have+the replacement+the rest of the string (starting
1395 at the new input position), so we won't have to check space
1396 when there are no errors in the rest of the string) */
1397 repptr = PyUnicode_AS_UNICODE(repunicode);
1398 repsize = PyUnicode_GET_SIZE(repunicode);
1399 requiredsize = *outpos + repsize + insize-newpos;
1400 if (requiredsize > outsize) {
1401 if (requiredsize<2*outsize)
1402 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001403 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404 goto onError;
1405 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1406 }
1407 *endinpos = newpos;
1408 *inptr = input + newpos;
1409 Py_UNICODE_COPY(*outptr, repptr, repsize);
1410 *outptr += repsize;
1411 *outpos += repsize;
1412 /* we made it! */
1413 res = 0;
1414
1415 onError:
1416 Py_XDECREF(restuple);
1417 return res;
1418}
1419
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420/* --- UTF-7 Codec -------------------------------------------------------- */
1421
1422/* see RFC2152 for details */
1423
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded.  encodeO / encodeWS select
   whether the optional classes 3 (Set O) and 2 (whitespace) count as
   special too.  The range tests come first so the table is never indexed
   out of bounds. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  The explicit range
   check keeps values outside 0..127 away from isalnum(), whose behaviour
   is undefined for arguments that are neither EOF nor representable as
   unsigned char. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Inverse of B64(): map a base64 alphabet character to its 6-bit value.
   Only meaningful for characters accepted by B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit one base64 character to *out++ for every complete group of 6 bits
   buffered in ch (bits counts the buffered bits); leftover bits stay in
   ch / bits. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit one Py_UNICODE to *out++ for every complete group of 16 bits
   buffered in ch.  Expects `errmsg` and a `utf7Error` label in the
   expanding function: code units in 0xDC00..0xDFFF set errmsg and jump
   there. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001487 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001488 const char *errors)
1489{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001490 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1491}
1492
1493PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1494 Py_ssize_t size,
1495 const char *errors,
1496 Py_ssize_t *consumed)
1497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001499 Py_ssize_t startinpos;
1500 Py_ssize_t endinpos;
1501 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001502 const char *e;
1503 PyUnicodeObject *unicode;
1504 Py_UNICODE *p;
1505 const char *errmsg = "";
1506 int inShift = 0;
1507 unsigned int bitsleft = 0;
1508 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 int surrogate = 0;
1510 PyObject *errorHandler = NULL;
1511 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512
1513 unicode = _PyUnicode_New(size);
1514 if (!unicode)
1515 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001516 if (size == 0) {
1517 if (consumed)
1518 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001521
1522 p = unicode->str;
1523 e = s + size;
1524
1525 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 Py_UNICODE ch;
1527 restart:
1528 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001529
1530 if (inShift) {
1531 if ((ch == '-') || !B64CHAR(ch)) {
1532 inShift = 0;
1533 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001534
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1536 if (bitsleft >= 6) {
1537 /* The shift sequence has a partial character in it. If
1538 bitsleft < 6 then we could just classify it as padding
1539 but that is not the case here */
1540
1541 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001542 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 }
1544 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001545 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 here so indicate the potential of a misencoded character. */
1547
1548 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1549 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1550 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001551 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 }
1553
1554 if (ch == '-') {
1555 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001556 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 inShift = 1;
1558 }
1559 } else if (SPECIAL(ch,0,0)) {
1560 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001561 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 } else {
1563 *p++ = ch;
1564 }
1565 } else {
1566 charsleft = (charsleft << 6) | UB64(ch);
1567 bitsleft += 6;
1568 s++;
1569 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1570 }
1571 }
1572 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001573 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574 s++;
1575 if (s < e && *s == '-') {
1576 s++;
1577 *p++ = '+';
1578 } else
1579 {
1580 inShift = 1;
1581 bitsleft = 0;
1582 }
1583 }
1584 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001585 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586 errmsg = "unexpected special character";
1587 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001588 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 }
1590 else {
1591 *p++ = ch;
1592 s++;
1593 }
1594 continue;
1595 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 outpos = p-PyUnicode_AS_UNICODE(unicode);
1597 endinpos = s-starts;
1598 if (unicode_decode_call_errorhandler(
1599 errors, &errorHandler,
1600 "utf7", errmsg,
1601 starts, size, &startinpos, &endinpos, &exc, &s,
1602 (PyObject **)&unicode, &outpos, &p))
1603 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 }
1605
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001606 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 outpos = p-PyUnicode_AS_UNICODE(unicode);
1608 endinpos = size;
1609 if (unicode_decode_call_errorhandler(
1610 errors, &errorHandler,
1611 "utf7", "unterminated shift sequence",
1612 starts, size, &startinpos, &endinpos, &exc, &s,
1613 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001614 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (s < e)
1616 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001618 if (consumed) {
1619 if(inShift)
1620 *consumed = startinpos;
1621 else
1622 *consumed = s-starts;
1623 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001625 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 goto onError;
1627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_XDECREF(errorHandler);
1629 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 return (PyObject *)unicode;
1631
1632onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 Py_DECREF(unicode);
1636 return NULL;
1637}
1638
1639
1640PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 int encodeSetO,
1643 int encodeWhiteSpace,
1644 const char *errors)
1645{
1646 PyObject *v;
1647 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001648 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001650 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 unsigned int bitsleft = 0;
1652 unsigned long charsleft = 0;
1653 char * out;
1654 char * start;
1655
1656 if (size == 0)
1657 return PyString_FromStringAndSize(NULL, 0);
1658
1659 v = PyString_FromStringAndSize(NULL, cbAllocated);
1660 if (v == NULL)
1661 return NULL;
1662
1663 start = out = PyString_AS_STRING(v);
1664 for (;i < size; ++i) {
1665 Py_UNICODE ch = s[i];
1666
1667 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001668 if (ch == '+') {
1669 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 *out++ = '-';
1671 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1672 charsleft = ch;
1673 bitsleft = 16;
1674 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001675 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001677 } else {
1678 *out++ = (char) ch;
1679 }
1680 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1682 *out++ = B64(charsleft << (6-bitsleft));
1683 charsleft = 0;
1684 bitsleft = 0;
1685 /* Characters not in the BASE64 set implicitly unshift the sequence
1686 so no '-' is required, except if the character is itself a '-' */
1687 if (B64CHAR(ch) || ch == '-') {
1688 *out++ = '-';
1689 }
1690 inShift = 0;
1691 *out++ = (char) ch;
1692 } else {
1693 bitsleft += 16;
1694 charsleft = (charsleft << 16) | ch;
1695 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1696
1697 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001698 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001699 or '-' then the shift sequence will be terminated implicitly and we
1700 don't have to insert a '-'. */
1701
1702 if (bitsleft == 0) {
1703 if (i + 1 < size) {
1704 Py_UNICODE ch2 = s[i+1];
1705
1706 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001707
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 } else if (B64CHAR(ch2) || ch2 == '-') {
1709 *out++ = '-';
1710 inShift = 0;
1711 } else {
1712 inShift = 0;
1713 }
1714
1715 }
1716 else {
1717 *out++ = '-';
1718 inShift = 0;
1719 }
1720 }
Tim Petersced69f82003-09-16 20:30:58 +00001721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001723 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001724 if (bitsleft) {
1725 *out++= B64(charsleft << (6-bitsleft) );
1726 *out++ = '-';
1727 }
1728
Tim Peters5de98422002-04-27 18:44:32 +00001729 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 return v;
1731}
1732
/* Drop the helper macros once the UTF-7 code above is done with them so
   they are not visible to the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1739
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740/* --- UTF-8 Codec -------------------------------------------------------- */
1741
/* Lookup table indexed by the first byte of a UTF-8 sequence.

   Layout (16 entries per row):
     0x00-0x7F -> 1   single-byte (ASCII) sequence
     0x80-0xBF -> 0   not a valid first byte
     0xC0-0xDF -> 2
     0xE0-0xEF -> 3
     0xF0-0xF7 -> 4
     0xF8-0xFB -> 5
     0xFC-0xFD -> 6
     0xFE-0xFF -> 0   not a valid first byte */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 const char *errors)
1767{
Walter Dörwald69652032004-09-07 20:24:22 +00001768 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1769}
1770
/* Decode `size` bytes of UTF-8 starting at `s` into a new Unicode object.

   s        - input bytes
   size     - number of input bytes
   errors   - error handling name, passed through to
              unicode_decode_call_errorhandler()
   consumed - if non-NULL, an incomplete sequence at the end of the input
              is not treated as an error: decoding stops in front of it and
              *consumed receives the number of bytes processed.  If NULL,
              such a trailing sequence is reported as "unexpected end of
              data".

   Returns the new object, or NULL if allocation, the error handler or the
   final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                        Py_ssize_t size,
                                        const char *errors,
                                        Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of input, for error offsets */
    int n;                          /* length of the current sequence */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* next free slot in the output */
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: the input byte count is used as the initial output length;
       the result is shrunk to the number of units actually written at
       the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes are copied through unchanged. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the first byte runs past the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* First byte is marked illegal in utf8_code_length. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above and every table entry
               from 0x80 up is 0 or at least 2, so this is not expected to
               be reached. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The second byte must have the form 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that would have fit into a single byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject values that would have fit into two bytes. */
            if (ch < 0x0800) {
                /* Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences (no check for them is made here);

                   XXX For wide builds (UCS-4) we should probably try
                       to recombine the surrogates into a single code
                       unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        /* Sequence decoded successfully: step over it. */
        s += n;
        continue;

    utf8Error:
        /* Only reached via goto.  The handler receives the addresses of
           s, unicode, outpos and p -- presumably so it can reposition the
           input and grow the output; confirm against
           unicode_decode_call_errorhandler().  A non-zero result aborts
           the decode. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf8", errmsg,
                 starts, size, &startinpos, &endinpos, &exc, &s,
                 (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1957
Tim Peters602f7402002-04-27 18:03:26 +00001958/* Allocation strategy: if the string is short, convert into a stack buffer
1959 and allocate exactly as much space needed at the end. Else allocate the
1960 maximum possible needed (4 result bytes per Unicode character), and return
1961 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001962*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001963PyObject *
1964PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001965 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001966 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967{
Tim Peters602f7402002-04-27 18:03:26 +00001968#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001969
Martin v. Löwis18e16552006-02-15 17:27:45 +00001970 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001971 PyObject *v; /* result string object */
1972 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001974 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001975 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001976
Tim Peters602f7402002-04-27 18:03:26 +00001977 assert(s != NULL);
1978 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
Tim Peters602f7402002-04-27 18:03:26 +00001980 if (size <= MAX_SHORT_UNICHARS) {
1981 /* Write into the stack buffer; nallocated can't overflow.
1982 * At the end, we'll allocate exactly as much heap space as it
1983 * turns out we need.
1984 */
1985 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1986 v = NULL; /* will allocate after we're done */
1987 p = stackbuf;
1988 }
1989 else {
1990 /* Overallocate on the heap, and give the excess back at the end. */
1991 nallocated = size * 4;
1992 if (nallocated / 4 != size) /* overflow! */
1993 return PyErr_NoMemory();
1994 v = PyString_FromStringAndSize(NULL, nallocated);
1995 if (v == NULL)
1996 return NULL;
1997 p = PyString_AS_STRING(v);
1998 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001999
Tim Peters602f7402002-04-27 18:03:26 +00002000 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002001 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002002
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002003 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002004 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002006
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002008 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002009 *p++ = (char)(0xc0 | (ch >> 6));
2010 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002012 else {
Tim Peters602f7402002-04-27 18:03:26 +00002013 /* Encode UCS2 Unicode ordinals */
2014 if (ch < 0x10000) {
2015 /* Special case: check for high surrogate */
2016 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2017 Py_UCS4 ch2 = s[i];
2018 /* Check for low surrogate and combine the two to
2019 form a UCS4 value */
2020 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002021 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002022 i++;
2023 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 }
Tim Peters602f7402002-04-27 18:03:26 +00002025 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002026 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002028 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2029 *p++ = (char)(0x80 | (ch & 0x3f));
2030 continue;
2031 }
2032encodeUCS4:
2033 /* Encode UCS4 Unicode ordinals */
2034 *p++ = (char)(0xf0 | (ch >> 18));
2035 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2036 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2037 *p++ = (char)(0x80 | (ch & 0x3f));
2038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002040
Tim Peters602f7402002-04-27 18:03:26 +00002041 if (v == NULL) {
2042 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002043 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002044 assert(nneeded <= nallocated);
2045 v = PyString_FromStringAndSize(stackbuf, nneeded);
2046 }
2047 else {
2048 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002049 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002050 assert(nneeded <= nallocated);
2051 _PyString_Resize(&v, nneeded);
2052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002054
Tim Peters602f7402002-04-27 18:03:26 +00002055#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056}
2057
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2059{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 if (!PyUnicode_Check(unicode)) {
2061 PyErr_BadArgument();
2062 return NULL;
2063 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002064 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2065 PyUnicode_GET_SIZE(unicode),
2066 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067}
2068
Walter Dörwald6e390802007-08-17 16:41:28 +00002069/* --- UTF-32 Codec ------------------------------------------------------- */
2070
2071PyObject *
2072PyUnicode_DecodeUTF32(const char *s,
2073 Py_ssize_t size,
2074 const char *errors,
2075 int *byteorder)
2076{
2077 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2078}
2079
2080PyObject *
2081PyUnicode_DecodeUTF32Stateful(const char *s,
2082 Py_ssize_t size,
2083 const char *errors,
2084 int *byteorder,
2085 Py_ssize_t *consumed)
2086{
2087 const char *starts = s;
2088 Py_ssize_t startinpos;
2089 Py_ssize_t endinpos;
2090 Py_ssize_t outpos;
2091 PyUnicodeObject *unicode;
2092 Py_UNICODE *p;
2093#ifndef Py_UNICODE_WIDE
2094 int i, pairs;
2095#else
2096 const int pairs = 0;
2097#endif
2098 const unsigned char *q, *e;
2099 int bo = 0; /* assume native ordering by default */
2100 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002101 /* Offsets from q for retrieving bytes in the right order. */
2102#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2103 int iorder[] = {0, 1, 2, 3};
2104#else
2105 int iorder[] = {3, 2, 1, 0};
2106#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002107 PyObject *errorHandler = NULL;
2108 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002109 /* On narrow builds we split characters outside the BMP into two
2110 codepoints => count how much extra space we need. */
2111#ifndef Py_UNICODE_WIDE
2112 for (i = pairs = 0; i < size/4; i++)
2113 if (((Py_UCS4 *)s)[i] >= 0x10000)
2114 pairs++;
2115#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002116
2117 /* This might be one to much, because of a BOM */
2118 unicode = _PyUnicode_New((size+3)/4+pairs);
2119 if (!unicode)
2120 return NULL;
2121 if (size == 0)
2122 return (PyObject *)unicode;
2123
2124 /* Unpack UTF-32 encoded data */
2125 p = unicode->str;
2126 q = (unsigned char *)s;
2127 e = q + size;
2128
2129 if (byteorder)
2130 bo = *byteorder;
2131
2132 /* Check for BOM marks (U+FEFF) in the input and adjust current
2133 byte order setting accordingly. In native mode, the leading BOM
2134 mark is skipped, in all other modes, it is copied to the output
2135 stream as-is (giving a ZWNBSP character). */
2136 if (bo == 0) {
2137 if (size >= 4) {
2138 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2139 (q[iorder[1]] << 8) | q[iorder[0]];
2140#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2141 if (bom == 0x0000FEFF) {
2142 q += 4;
2143 bo = -1;
2144 }
2145 else if (bom == 0xFFFE0000) {
2146 q += 4;
2147 bo = 1;
2148 }
2149#else
2150 if (bom == 0x0000FEFF) {
2151 q += 4;
2152 bo = 1;
2153 }
2154 else if (bom == 0xFFFE0000) {
2155 q += 4;
2156 bo = -1;
2157 }
2158#endif
2159 }
2160 }
2161
2162 if (bo == -1) {
2163 /* force LE */
2164 iorder[0] = 0;
2165 iorder[1] = 1;
2166 iorder[2] = 2;
2167 iorder[3] = 3;
2168 }
2169 else if (bo == 1) {
2170 /* force BE */
2171 iorder[0] = 3;
2172 iorder[1] = 2;
2173 iorder[2] = 1;
2174 iorder[3] = 0;
2175 }
2176
2177 while (q < e) {
2178 Py_UCS4 ch;
2179 /* remaining bytes at the end? (size should be divisible by 4) */
2180 if (e-q<4) {
2181 if (consumed)
2182 break;
2183 errmsg = "truncated data";
2184 startinpos = ((const char *)q)-starts;
2185 endinpos = ((const char *)e)-starts;
2186 goto utf32Error;
2187 /* The remaining input chars are ignored if the callback
2188 chooses to skip the input */
2189 }
2190 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2191 (q[iorder[1]] << 8) | q[iorder[0]];
2192
2193 if (ch >= 0x110000)
2194 {
2195 errmsg = "codepoint not in range(0x110000)";
2196 startinpos = ((const char *)q)-starts;
2197 endinpos = startinpos+4;
2198 goto utf32Error;
2199 }
2200#ifndef Py_UNICODE_WIDE
2201 if (ch >= 0x10000)
2202 {
2203 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2204 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2205 }
2206 else
2207#endif
2208 *p++ = ch;
2209 q += 4;
2210 continue;
2211 utf32Error:
2212 outpos = p-PyUnicode_AS_UNICODE(unicode);
2213 if (unicode_decode_call_errorhandler(
2214 errors, &errorHandler,
2215 "utf32", errmsg,
2216 starts, size, &startinpos, &endinpos, &exc, &s,
2217 (PyObject **)&unicode, &outpos, &p))
2218 goto onError;
2219 }
2220
2221 if (byteorder)
2222 *byteorder = bo;
2223
2224 if (consumed)
2225 *consumed = (const char *)q-starts;
2226
2227 /* Adjust length */
2228 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2229 goto onError;
2230
2231 Py_XDECREF(errorHandler);
2232 Py_XDECREF(exc);
2233 return (PyObject *)unicode;
2234
2235onError:
2236 Py_DECREF(unicode);
2237 Py_XDECREF(errorHandler);
2238 Py_XDECREF(exc);
2239 return NULL;
2240}
2241
2242PyObject *
2243PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2244 Py_ssize_t size,
2245 const char *errors,
2246 int byteorder)
2247{
2248 PyObject *v;
2249 unsigned char *p;
2250#ifndef Py_UNICODE_WIDE
2251 int i, pairs;
2252#else
2253 const int pairs = 0;
2254#endif
2255 /* Offsets from p for storing byte pairs in the right order. */
2256#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2257 int iorder[] = {0, 1, 2, 3};
2258#else
2259 int iorder[] = {3, 2, 1, 0};
2260#endif
2261
2262#define STORECHAR(CH) \
2263 do { \
2264 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2265 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2266 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2267 p[iorder[0]] = (CH) & 0xff; \
2268 p += 4; \
2269 } while(0)
2270
2271 /* In narrow builds we can output surrogate pairs as one codepoint,
2272 so we need less space. */
2273#ifndef Py_UNICODE_WIDE
2274 for (i = pairs = 0; i < size-1; i++)
2275 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2276 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2277 pairs++;
2278#endif
2279 v = PyString_FromStringAndSize(NULL,
2280 4 * (size - pairs + (byteorder == 0)));
2281 if (v == NULL)
2282 return NULL;
2283
2284 p = (unsigned char *)PyString_AS_STRING(v);
2285 if (byteorder == 0)
2286 STORECHAR(0xFEFF);
2287 if (size == 0)
2288 return v;
2289
2290 if (byteorder == -1) {
2291 /* force LE */
2292 iorder[0] = 0;
2293 iorder[1] = 1;
2294 iorder[2] = 2;
2295 iorder[3] = 3;
2296 }
2297 else if (byteorder == 1) {
2298 /* force BE */
2299 iorder[0] = 3;
2300 iorder[1] = 2;
2301 iorder[2] = 1;
2302 iorder[3] = 0;
2303 }
2304
2305 while (size-- > 0) {
2306 Py_UCS4 ch = *s++;
2307#ifndef Py_UNICODE_WIDE
2308 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2309 Py_UCS4 ch2 = *s;
2310 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2311 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2312 s++;
2313 size--;
2314 }
2315 }
2316#endif
2317 STORECHAR(ch);
2318 }
2319 return v;
2320#undef STORECHAR
2321}
2322
2323PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2324{
2325 if (!PyUnicode_Check(unicode)) {
2326 PyErr_BadArgument();
2327 return NULL;
2328 }
2329 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2330 PyUnicode_GET_SIZE(unicode),
2331 NULL,
2332 0);
2333}
2334
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335/* --- UTF-16 Codec ------------------------------------------------------- */
2336
Tim Peters772747b2001-08-09 22:21:55 +00002337PyObject *
2338PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002339 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002340 const char *errors,
2341 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342{
Walter Dörwald69652032004-09-07 20:24:22 +00002343 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2344}
2345
2346PyObject *
2347PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002348 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002349 const char *errors,
2350 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002354 Py_ssize_t startinpos;
2355 Py_ssize_t endinpos;
2356 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 PyUnicodeObject *unicode;
2358 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002359 const unsigned char *q, *e;
2360 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002361 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002362 /* Offsets from q for retrieving byte pairs in the right order. */
2363#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2364 int ihi = 1, ilo = 0;
2365#else
2366 int ihi = 0, ilo = 1;
2367#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 PyObject *errorHandler = NULL;
2369 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370
2371 /* Note: size will always be longer than the resulting Unicode
2372 character count */
2373 unicode = _PyUnicode_New(size);
2374 if (!unicode)
2375 return NULL;
2376 if (size == 0)
2377 return (PyObject *)unicode;
2378
2379 /* Unpack UTF-16 encoded data */
2380 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002381 q = (unsigned char *)s;
2382 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383
2384 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002385 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002387 /* Check for BOM marks (U+FEFF) in the input and adjust current
2388 byte order setting accordingly. In native mode, the leading BOM
2389 mark is skipped, in all other modes, it is copied to the output
2390 stream as-is (giving a ZWNBSP character). */
2391 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002392 if (size >= 2) {
2393 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002394#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002395 if (bom == 0xFEFF) {
2396 q += 2;
2397 bo = -1;
2398 }
2399 else if (bom == 0xFFFE) {
2400 q += 2;
2401 bo = 1;
2402 }
Tim Petersced69f82003-09-16 20:30:58 +00002403#else
Walter Dörwald69652032004-09-07 20:24:22 +00002404 if (bom == 0xFEFF) {
2405 q += 2;
2406 bo = 1;
2407 }
2408 else if (bom == 0xFFFE) {
2409 q += 2;
2410 bo = -1;
2411 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002412#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002413 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415
Tim Peters772747b2001-08-09 22:21:55 +00002416 if (bo == -1) {
2417 /* force LE */
2418 ihi = 1;
2419 ilo = 0;
2420 }
2421 else if (bo == 1) {
2422 /* force BE */
2423 ihi = 0;
2424 ilo = 1;
2425 }
2426
2427 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002428 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002429 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002430 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002431 if (consumed)
2432 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 errmsg = "truncated data";
2434 startinpos = ((const char *)q)-starts;
2435 endinpos = ((const char *)e)-starts;
2436 goto utf16Error;
2437 /* The remaining input chars are ignored if the callback
2438 chooses to skip the input */
2439 }
2440 ch = (q[ihi] << 8) | q[ilo];
2441
Tim Peters772747b2001-08-09 22:21:55 +00002442 q += 2;
2443
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 if (ch < 0xD800 || ch > 0xDFFF) {
2445 *p++ = ch;
2446 continue;
2447 }
2448
2449 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002450 if (q >= e) {
2451 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002452 startinpos = (((const char *)q)-2)-starts;
2453 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002454 goto utf16Error;
2455 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002456 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002457 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2458 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002459 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002460#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002461 *p++ = ch;
2462 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002463#else
2464 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002465#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002466 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002467 }
2468 else {
2469 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 startinpos = (((const char *)q)-4)-starts;
2471 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002472 goto utf16Error;
2473 }
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002476 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 startinpos = (((const char *)q)-2)-starts;
2478 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002479 /* Fall through to report the error */
2480
2481 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 outpos = p-PyUnicode_AS_UNICODE(unicode);
2483 if (unicode_decode_call_errorhandler(
2484 errors, &errorHandler,
2485 "utf16", errmsg,
2486 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2487 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 }
2490
2491 if (byteorder)
2492 *byteorder = bo;
2493
Walter Dörwald69652032004-09-07 20:24:22 +00002494 if (consumed)
2495 *consumed = (const char *)q-starts;
2496
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002498 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 goto onError;
2500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return (PyObject *)unicode;
2504
2505onError:
2506 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 Py_XDECREF(errorHandler);
2508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 return NULL;
2510}
2511
Tim Peters772747b2001-08-09 22:21:55 +00002512PyObject *
2513PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002514 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002515 const char *errors,
2516 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517{
2518 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002519 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002520#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002521 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002522#else
2523 const int pairs = 0;
2524#endif
Tim Peters772747b2001-08-09 22:21:55 +00002525 /* Offsets from p for storing byte pairs in the right order. */
2526#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2527 int ihi = 1, ilo = 0;
2528#else
2529 int ihi = 0, ilo = 1;
2530#endif
2531
2532#define STORECHAR(CH) \
2533 do { \
2534 p[ihi] = ((CH) >> 8) & 0xff; \
2535 p[ilo] = (CH) & 0xff; \
2536 p += 2; \
2537 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002539#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002540 for (i = pairs = 0; i < size; i++)
2541 if (s[i] >= 0x10000)
2542 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002543#endif
Tim Petersced69f82003-09-16 20:30:58 +00002544 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002545 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 if (v == NULL)
2547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548
Tim Peters772747b2001-08-09 22:21:55 +00002549 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002551 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002552 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002553 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002554
2555 if (byteorder == -1) {
2556 /* force LE */
2557 ihi = 1;
2558 ilo = 0;
2559 }
2560 else if (byteorder == 1) {
2561 /* force BE */
2562 ihi = 0;
2563 ilo = 1;
2564 }
2565
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002566 while (size-- > 0) {
2567 Py_UNICODE ch = *s++;
2568 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002569#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002570 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002571 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2572 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002574#endif
Tim Peters772747b2001-08-09 22:21:55 +00002575 STORECHAR(ch);
2576 if (ch2)
2577 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002580#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581}
2582
2583PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2584{
2585 if (!PyUnicode_Check(unicode)) {
2586 PyErr_BadArgument();
2587 return NULL;
2588 }
2589 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2590 PyUnicode_GET_SIZE(unicode),
2591 NULL,
2592 0);
2593}
2594
2595/* --- Unicode Escape Codec ----------------------------------------------- */
2596
Fredrik Lundh06d12682001-01-24 07:59:11 +00002597static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002598
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002600 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 const char *errors)
2602{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002604 Py_ssize_t startinpos;
2605 Py_ssize_t endinpos;
2606 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002609 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002611 char* message;
2612 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002613 PyObject *errorHandler = NULL;
2614 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002615
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 /* Escaped strings will always be longer than the resulting
2617 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 length after conversion to the true value.
2619 (but if the error callback returns a long replacement string
2620 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 v = _PyUnicode_New(size);
2622 if (v == NULL)
2623 goto onError;
2624 if (size == 0)
2625 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002629
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 while (s < end) {
2631 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002632 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634
2635 /* Non-escape characters are interpreted as Unicode ordinals */
2636 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 continue;
2639 }
2640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002641 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 /* \ - Escapes */
2643 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002644 c = *s++;
2645 if (s > end)
2646 c = '\0'; /* Invalid after \ */
2647 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648
2649 /* \x escapes */
2650 case '\n': break;
2651 case '\\': *p++ = '\\'; break;
2652 case '\'': *p++ = '\''; break;
2653 case '\"': *p++ = '\"'; break;
2654 case 'b': *p++ = '\b'; break;
2655 case 'f': *p++ = '\014'; break; /* FF */
2656 case 't': *p++ = '\t'; break;
2657 case 'n': *p++ = '\n'; break;
2658 case 'r': *p++ = '\r'; break;
2659 case 'v': *p++ = '\013'; break; /* VT */
2660 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2661
2662 /* \OOO (octal) escapes */
2663 case '0': case '1': case '2': case '3':
2664 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002665 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002666 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002667 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002668 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002669 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002671 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 break;
2673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* hex escapes */
2675 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002677 digits = 2;
2678 message = "truncated \\xXX escape";
2679 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680
Fredrik Lundhccc74732001-02-18 22:13:49 +00002681 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002683 digits = 4;
2684 message = "truncated \\uXXXX escape";
2685 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686
Fredrik Lundhccc74732001-02-18 22:13:49 +00002687 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002688 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 digits = 8;
2690 message = "truncated \\UXXXXXXXX escape";
2691 hexescape:
2692 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 outpos = p-PyUnicode_AS_UNICODE(v);
2694 if (s+digits>end) {
2695 endinpos = size;
2696 if (unicode_decode_call_errorhandler(
2697 errors, &errorHandler,
2698 "unicodeescape", "end of string in escape sequence",
2699 starts, size, &startinpos, &endinpos, &exc, &s,
2700 (PyObject **)&v, &outpos, &p))
2701 goto onError;
2702 goto nextByte;
2703 }
2704 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002706 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 endinpos = (s+i+1)-starts;
2708 if (unicode_decode_call_errorhandler(
2709 errors, &errorHandler,
2710 "unicodeescape", message,
2711 starts, size, &startinpos, &endinpos, &exc, &s,
2712 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002714 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002715 }
2716 chr = (chr<<4) & ~0xF;
2717 if (c >= '0' && c <= '9')
2718 chr += c - '0';
2719 else if (c >= 'a' && c <= 'f')
2720 chr += 10 + c - 'a';
2721 else
2722 chr += 10 + c - 'A';
2723 }
2724 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002725 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 /* _decoding_error will have already written into the
2727 target buffer. */
2728 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002730 /* when we get here, chr is a 32-bit unicode character */
2731 if (chr <= 0xffff)
2732 /* UCS-2 character */
2733 *p++ = (Py_UNICODE) chr;
2734 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002735 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002736 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002737#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002738 *p++ = chr;
2739#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002740 chr -= 0x10000L;
2741 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002742 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002743#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002744 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 endinpos = s-starts;
2746 outpos = p-PyUnicode_AS_UNICODE(v);
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "unicodeescape", "illegal Unicode character",
2750 starts, size, &startinpos, &endinpos, &exc, &s,
2751 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 goto onError;
2753 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 break;
2755
2756 /* \N{name} */
2757 case 'N':
2758 message = "malformed \\N character escape";
2759 if (ucnhash_CAPI == NULL) {
2760 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002761 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002762 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763 if (m == NULL)
2764 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002765 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002766 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002767 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002768 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002769 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002770 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 if (ucnhash_CAPI == NULL)
2772 goto ucnhashError;
2773 }
2774 if (*s == '{') {
2775 const char *start = s+1;
2776 /* look for the closing brace */
2777 while (*s != '}' && s < end)
2778 s++;
2779 if (s > start && s < end && *s == '}') {
2780 /* found a name. look it up in the unicode database */
2781 message = "unknown Unicode character name";
2782 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002783 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 goto store;
2785 }
2786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 endinpos = s-starts;
2788 outpos = p-PyUnicode_AS_UNICODE(v);
2789 if (unicode_decode_call_errorhandler(
2790 errors, &errorHandler,
2791 "unicodeescape", message,
2792 starts, size, &startinpos, &endinpos, &exc, &s,
2793 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002794 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 break;
2796
2797 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002798 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 message = "\\ at end of string";
2800 s--;
2801 endinpos = s-starts;
2802 outpos = p-PyUnicode_AS_UNICODE(v);
2803 if (unicode_decode_call_errorhandler(
2804 errors, &errorHandler,
2805 "unicodeescape", message,
2806 starts, size, &startinpos, &endinpos, &exc, &s,
2807 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002808 goto onError;
2809 }
2810 else {
2811 *p++ = '\\';
2812 *p++ = (unsigned char)s[-1];
2813 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002816 nextByte:
2817 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002819 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002821 Py_XDECREF(errorHandler);
2822 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002824
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002826 PyErr_SetString(
2827 PyExc_UnicodeError,
2828 "\\N escapes not supported (can't load unicodedata module)"
2829 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002830 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 Py_XDECREF(errorHandler);
2832 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002833 return NULL;
2834
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 Py_XDECREF(errorHandler);
2838 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 return NULL;
2840}
2841
2842/* Return a Unicode-Escape string version of the Unicode object.
2843
2844 If quotes is true, the string is enclosed in u"" or u'' quotes as
2845 appropriate.
2846
2847*/
2848
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002849Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002850 Py_ssize_t size,
2851 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002852{
2853 /* like wcschr, but doesn't stop at NULL characters */
2854
2855 while (size-- > 0) {
2856 if (*s == ch)
2857 return s;
2858 s++;
2859 }
2860
2861 return NULL;
2862}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002863
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864static
2865PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002866 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 int quotes)
2868{
2869 PyObject *repr;
2870 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002872 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
Neal Norwitz17753ec2006-08-21 22:21:19 +00002874 /* XXX(nnorwitz): rather than over-allocating, it would be
2875 better to choose a different scheme. Perhaps scan the
2876 first N-chars of the string and allocate based on that size.
2877 */
2878 /* Initial allocation is based on the longest-possible unichr
2879 escape.
2880
2881 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2882 unichr, so in this case it's the longest unichr escape. In
2883 narrow (UTF-16) builds this is five chars per source unichr
2884 since there are two unichrs in the surrogate pair, so in narrow
2885 (UTF-16) builds it's not the longest unichr escape.
2886
2887 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2888 so in the narrow (UTF-16) build case it's the longest unichr
2889 escape.
2890 */
2891
2892 repr = PyString_FromStringAndSize(NULL,
2893 2
2894#ifdef Py_UNICODE_WIDE
2895 + 10*size
2896#else
2897 + 6*size
2898#endif
2899 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900 if (repr == NULL)
2901 return NULL;
2902
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002903 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904
2905 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002907 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 !findchar(s, size, '"')) ? '"' : '\'';
2909 }
2910 while (size-- > 0) {
2911 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002912
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002913 /* Escape quotes and backslashes */
2914 if ((quotes &&
2915 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 *p++ = '\\';
2917 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002918 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002919 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002920
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002921#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002922 /* Map 21-bit characters to '\U00xxxxxx' */
2923 else if (ch >= 0x10000) {
2924 *p++ = '\\';
2925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002933 *p++ = hexdigit[ch & 0x0000000F];
2934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002935 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002936#else
2937 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002938 else if (ch >= 0xD800 && ch < 0xDC00) {
2939 Py_UNICODE ch2;
2940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002942 ch2 = *s++;
2943 size--;
2944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2946 *p++ = '\\';
2947 *p++ = 'U';
2948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2955 *p++ = hexdigit[ucs & 0x0000000F];
2956 continue;
2957 }
2958 /* Fall through: isolated surrogates are copied as-is */
2959 s--;
2960 size++;
2961 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002962#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002963
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002965 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 *p++ = '\\';
2967 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002968 *p++ = hexdigit[(ch >> 12) & 0x000F];
2969 *p++ = hexdigit[(ch >> 8) & 0x000F];
2970 *p++ = hexdigit[(ch >> 4) & 0x000F];
2971 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002973
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002974 /* Map special whitespace to '\t', \n', '\r' */
2975 else if (ch == '\t') {
2976 *p++ = '\\';
2977 *p++ = 't';
2978 }
2979 else if (ch == '\n') {
2980 *p++ = '\\';
2981 *p++ = 'n';
2982 }
2983 else if (ch == '\r') {
2984 *p++ = '\\';
2985 *p++ = 'r';
2986 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002987
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002988 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002989 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002991 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002992 *p++ = hexdigit[(ch >> 4) & 0x000F];
2993 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002994 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 /* Copy everything else as-is */
2997 else
2998 *p++ = (char) ch;
2999 }
3000 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003001 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002
3003 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003004 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 return repr;
3006}
3007
3008PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003009 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010{
3011 return unicodeescape_string(s, size, 0);
3012}
3013
3014PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3015{
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 return NULL;
3019 }
3020 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3021 PyUnicode_GET_SIZE(unicode));
3022}
3023
3024/* --- Raw Unicode Escape Codec ------------------------------------------- */
3025
3026PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003027 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 const char *errors)
3029{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003031 Py_ssize_t startinpos;
3032 Py_ssize_t endinpos;
3033 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 const char *end;
3037 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 PyObject *errorHandler = NULL;
3039 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 /* Escaped strings will always be longer than the resulting
3042 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 length after conversion to the true value. (But decoding error
3044 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 v = _PyUnicode_New(size);
3046 if (v == NULL)
3047 goto onError;
3048 if (size == 0)
3049 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 end = s + size;
3052 while (s < end) {
3053 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003054 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003056 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057
3058 /* Non-escape characters are interpreted as Unicode ordinals */
3059 if (*s != '\\') {
3060 *p++ = (unsigned char)*s++;
3061 continue;
3062 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064
3065 /* \u-escapes are only interpreted iff the number of leading
3066 backslashes if odd */
3067 bs = s;
3068 for (;s < end;) {
3069 if (*s != '\\')
3070 break;
3071 *p++ = (unsigned char)*s++;
3072 }
3073 if (((s - bs) & 1) == 0 ||
3074 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003075 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 continue;
3077 }
3078 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003079 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 s++;
3081
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003082 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003084 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 endinpos = s-starts;
3088 if (unicode_decode_call_errorhandler(
3089 errors, &errorHandler,
3090 "rawunicodeescape", "truncated \\uXXXX",
3091 starts, size, &startinpos, &endinpos, &exc, &s,
3092 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 }
3096 x = (x<<4) & ~0xF;
3097 if (c >= '0' && c <= '9')
3098 x += c - '0';
3099 else if (c >= 'a' && c <= 'f')
3100 x += 10 + c - 'a';
3101 else
3102 x += 10 + c - 'A';
3103 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003104 if (x <= 0xffff)
3105 /* UCS-2 character */
3106 *p++ = (Py_UNICODE) x;
3107 else if (x <= 0x10ffff) {
3108 /* UCS-4 character. Either store directly, or as
3109 surrogate pair. */
3110#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003111 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003112#else
3113 x -= 0x10000L;
3114 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3115 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3116#endif
3117 } else {
3118 endinpos = s-starts;
3119 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003120 if (unicode_decode_call_errorhandler(
3121 errors, &errorHandler,
3122 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3123 starts, size, &startinpos, &endinpos, &exc, &s,
3124 (PyObject **)&v, &outpos, &p))
3125 goto onError;
3126 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 nextByte:
3128 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003130 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 Py_XDECREF(errorHandler);
3133 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003135
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 onError:
3137 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 Py_XDECREF(errorHandler);
3139 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 return NULL;
3141}
3142
3143PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003144 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145{
3146 PyObject *repr;
3147 char *p;
3148 char *q;
3149
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003150 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152#ifdef Py_UNICODE_WIDE
3153 repr = PyString_FromStringAndSize(NULL, 10 * size);
3154#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (repr == NULL)
3158 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003159 if (size == 0)
3160 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161
3162 p = q = PyString_AS_STRING(repr);
3163 while (size-- > 0) {
3164 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003165#ifdef Py_UNICODE_WIDE
3166 /* Map 32-bit characters to '\Uxxxxxxxx' */
3167 if (ch >= 0x10000) {
3168 *p++ = '\\';
3169 *p++ = 'U';
3170 *p++ = hexdigit[(ch >> 28) & 0xf];
3171 *p++ = hexdigit[(ch >> 24) & 0xf];
3172 *p++ = hexdigit[(ch >> 20) & 0xf];
3173 *p++ = hexdigit[(ch >> 16) & 0xf];
3174 *p++ = hexdigit[(ch >> 12) & 0xf];
3175 *p++ = hexdigit[(ch >> 8) & 0xf];
3176 *p++ = hexdigit[(ch >> 4) & 0xf];
3177 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003178 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003180#else
3181 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3182 if (ch >= 0xD800 && ch < 0xDC00) {
3183 Py_UNICODE ch2;
3184 Py_UCS4 ucs;
3185
3186 ch2 = *s++;
3187 size--;
3188 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3189 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3190 *p++ = '\\';
3191 *p++ = 'U';
3192 *p++ = hexdigit[(ucs >> 28) & 0xf];
3193 *p++ = hexdigit[(ucs >> 24) & 0xf];
3194 *p++ = hexdigit[(ucs >> 20) & 0xf];
3195 *p++ = hexdigit[(ucs >> 16) & 0xf];
3196 *p++ = hexdigit[(ucs >> 12) & 0xf];
3197 *p++ = hexdigit[(ucs >> 8) & 0xf];
3198 *p++ = hexdigit[(ucs >> 4) & 0xf];
3199 *p++ = hexdigit[ucs & 0xf];
3200 continue;
3201 }
3202 /* Fall through: isolated surrogates are copied as-is */
3203 s--;
3204 size++;
3205 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003206#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 /* Map 16-bit characters to '\uxxxx' */
3208 if (ch >= 256) {
3209 *p++ = '\\';
3210 *p++ = 'u';
3211 *p++ = hexdigit[(ch >> 12) & 0xf];
3212 *p++ = hexdigit[(ch >> 8) & 0xf];
3213 *p++ = hexdigit[(ch >> 4) & 0xf];
3214 *p++ = hexdigit[ch & 15];
3215 }
3216 /* Copy everything else as-is */
3217 else
3218 *p++ = (char) ch;
3219 }
3220 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003221 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 return repr;
3223}
3224
3225PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3226{
3227 if (!PyUnicode_Check(unicode)) {
3228 PyErr_BadArgument();
3229 return NULL;
3230 }
3231 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3232 PyUnicode_GET_SIZE(unicode));
3233}
3234
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003235/* --- Unicode Internal Codec ------------------------------------------- */
3236
3237PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003238 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003239 const char *errors)
3240{
3241 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003242 Py_ssize_t startinpos;
3243 Py_ssize_t endinpos;
3244 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003245 PyUnicodeObject *v;
3246 Py_UNICODE *p;
3247 const char *end;
3248 const char *reason;
3249 PyObject *errorHandler = NULL;
3250 PyObject *exc = NULL;
3251
Neal Norwitzd43069c2006-01-08 01:12:10 +00003252#ifdef Py_UNICODE_WIDE
3253 Py_UNICODE unimax = PyUnicode_GetMax();
3254#endif
3255
Armin Rigo7ccbca92006-10-04 12:17:45 +00003256 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003257 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3258 if (v == NULL)
3259 goto onError;
3260 if (PyUnicode_GetSize((PyObject *)v) == 0)
3261 return (PyObject *)v;
3262 p = PyUnicode_AS_UNICODE(v);
3263 end = s + size;
3264
3265 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003266 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003267 /* We have to sanity check the raw data, otherwise doom looms for
3268 some malformed UCS-4 data. */
3269 if (
3270 #ifdef Py_UNICODE_WIDE
3271 *p > unimax || *p < 0 ||
3272 #endif
3273 end-s < Py_UNICODE_SIZE
3274 )
3275 {
3276 startinpos = s - starts;
3277 if (end-s < Py_UNICODE_SIZE) {
3278 endinpos = end-starts;
3279 reason = "truncated input";
3280 }
3281 else {
3282 endinpos = s - starts + Py_UNICODE_SIZE;
3283 reason = "illegal code point (> 0x10FFFF)";
3284 }
3285 outpos = p - PyUnicode_AS_UNICODE(v);
3286 if (unicode_decode_call_errorhandler(
3287 errors, &errorHandler,
3288 "unicode_internal", reason,
3289 starts, size, &startinpos, &endinpos, &exc, &s,
3290 (PyObject **)&v, &outpos, &p)) {
3291 goto onError;
3292 }
3293 }
3294 else {
3295 p++;
3296 s += Py_UNICODE_SIZE;
3297 }
3298 }
3299
Martin v. Löwis412fb672006-04-13 06:34:32 +00003300 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003301 goto onError;
3302 Py_XDECREF(errorHandler);
3303 Py_XDECREF(exc);
3304 return (PyObject *)v;
3305
3306 onError:
3307 Py_XDECREF(v);
3308 Py_XDECREF(errorHandler);
3309 Py_XDECREF(exc);
3310 return NULL;
3311}
3312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313/* --- Latin-1 Codec ------------------------------------------------------ */
3314
3315PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 const char *errors)
3318{
3319 PyUnicodeObject *v;
3320 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003321
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003323 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003324 Py_UNICODE r = *(unsigned char*)s;
3325 return PyUnicode_FromUnicode(&r, 1);
3326 }
3327
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 v = _PyUnicode_New(size);
3329 if (v == NULL)
3330 goto onError;
3331 if (size == 0)
3332 return (PyObject *)v;
3333 p = PyUnicode_AS_UNICODE(v);
3334 while (size-- > 0)
3335 *p++ = (unsigned char)*s++;
3336 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 onError:
3339 Py_XDECREF(v);
3340 return NULL;
3341}
3342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343/* create or adjust a UnicodeEncodeError */
3344static void make_encode_exception(PyObject **exceptionObject,
3345 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003346 const Py_UNICODE *unicode, Py_ssize_t size,
3347 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 if (*exceptionObject == NULL) {
3351 *exceptionObject = PyUnicodeEncodeError_Create(
3352 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 }
3354 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3356 goto onError;
3357 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3358 goto onError;
3359 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3360 goto onError;
3361 return;
3362 onError:
3363 Py_DECREF(*exceptionObject);
3364 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 }
3366}
3367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368/* raises a UnicodeEncodeError */
3369static void raise_encode_exception(PyObject **exceptionObject,
3370 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 const Py_UNICODE *unicode, Py_ssize_t size,
3372 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373 const char *reason)
3374{
3375 make_encode_exception(exceptionObject,
3376 encoding, unicode, size, startpos, endpos, reason);
3377 if (*exceptionObject != NULL)
3378 PyCodec_StrictErrors(*exceptionObject);
3379}
3380
3381/* error handling callback helper:
3382 build arguments, call the callback and check the arguments,
3383 put the result into newpos and return the replacement string, which
3384 has to be freed by the caller */
3385static PyObject *unicode_encode_call_errorhandler(const char *errors,
3386 PyObject **errorHandler,
3387 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003388 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3389 Py_ssize_t startpos, Py_ssize_t endpos,
3390 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003392 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393
3394 PyObject *restuple;
3395 PyObject *resunicode;
3396
3397 if (*errorHandler == NULL) {
3398 *errorHandler = PyCodec_LookupError(errors);
3399 if (*errorHandler == NULL)
3400 return NULL;
3401 }
3402
3403 make_encode_exception(exceptionObject,
3404 encoding, unicode, size, startpos, endpos, reason);
3405 if (*exceptionObject == NULL)
3406 return NULL;
3407
3408 restuple = PyObject_CallFunctionObjArgs(
3409 *errorHandler, *exceptionObject, NULL);
3410 if (restuple == NULL)
3411 return NULL;
3412 if (!PyTuple_Check(restuple)) {
3413 PyErr_Format(PyExc_TypeError, &argparse[4]);
3414 Py_DECREF(restuple);
3415 return NULL;
3416 }
3417 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3418 &resunicode, newpos)) {
3419 Py_DECREF(restuple);
3420 return NULL;
3421 }
3422 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003423 *newpos = size+*newpos;
3424 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003425 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003426 Py_DECREF(restuple);
3427 return NULL;
3428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 Py_INCREF(resunicode);
3430 Py_DECREF(restuple);
3431 return resunicode;
3432}
3433
3434static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003435 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 const char *errors,
3437 int limit)
3438{
3439 /* output object */
3440 PyObject *res;
3441 /* pointers to the beginning and end+1 of input */
3442 const Py_UNICODE *startp = p;
3443 const Py_UNICODE *endp = p + size;
3444 /* pointer to the beginning of the unencodable characters */
3445 /* const Py_UNICODE *badp = NULL; */
3446 /* pointer into the output */
3447 char *str;
3448 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t respos = 0;
3450 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003451 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3452 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 PyObject *errorHandler = NULL;
3454 PyObject *exc = NULL;
3455 /* the following variable is used for caching string comparisons
3456 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3457 int known_errorHandler = -1;
3458
3459 /* allocate enough for a simple encoding without
3460 replacements, if we need more, we'll resize */
3461 res = PyString_FromStringAndSize(NULL, size);
3462 if (res == NULL)
3463 goto onError;
3464 if (size == 0)
3465 return res;
3466 str = PyString_AS_STRING(res);
3467 ressize = size;
3468
3469 while (p<endp) {
3470 Py_UNICODE c = *p;
3471
3472 /* can we encode this? */
3473 if (c<limit) {
3474 /* no overflow check, because we know that the space is enough */
3475 *str++ = (char)c;
3476 ++p;
3477 }
3478 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003479 Py_ssize_t unicodepos = p-startp;
3480 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003482 Py_ssize_t repsize;
3483 Py_ssize_t newpos;
3484 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 Py_UNICODE *uni2;
3486 /* startpos for collecting unencodable chars */
3487 const Py_UNICODE *collstart = p;
3488 const Py_UNICODE *collend = p;
3489 /* find all unecodable characters */
3490 while ((collend < endp) && ((*collend)>=limit))
3491 ++collend;
3492 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3493 if (known_errorHandler==-1) {
3494 if ((errors==NULL) || (!strcmp(errors, "strict")))
3495 known_errorHandler = 1;
3496 else if (!strcmp(errors, "replace"))
3497 known_errorHandler = 2;
3498 else if (!strcmp(errors, "ignore"))
3499 known_errorHandler = 3;
3500 else if (!strcmp(errors, "xmlcharrefreplace"))
3501 known_errorHandler = 4;
3502 else
3503 known_errorHandler = 0;
3504 }
3505 switch (known_errorHandler) {
3506 case 1: /* strict */
3507 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3508 goto onError;
3509 case 2: /* replace */
3510 while (collstart++<collend)
3511 *str++ = '?'; /* fall through */
3512 case 3: /* ignore */
3513 p = collend;
3514 break;
3515 case 4: /* xmlcharrefreplace */
3516 respos = str-PyString_AS_STRING(res);
3517 /* determine replacement size (temporarily (mis)uses p) */
3518 for (p = collstart, repsize = 0; p < collend; ++p) {
3519 if (*p<10)
3520 repsize += 2+1+1;
3521 else if (*p<100)
3522 repsize += 2+2+1;
3523 else if (*p<1000)
3524 repsize += 2+3+1;
3525 else if (*p<10000)
3526 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003527#ifndef Py_UNICODE_WIDE
3528 else
3529 repsize += 2+5+1;
3530#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 else if (*p<100000)
3532 repsize += 2+5+1;
3533 else if (*p<1000000)
3534 repsize += 2+6+1;
3535 else
3536 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003537#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 }
3539 requiredsize = respos+repsize+(endp-collend);
3540 if (requiredsize > ressize) {
3541 if (requiredsize<2*ressize)
3542 requiredsize = 2*ressize;
3543 if (_PyString_Resize(&res, requiredsize))
3544 goto onError;
3545 str = PyString_AS_STRING(res) + respos;
3546 ressize = requiredsize;
3547 }
3548 /* generate replacement (temporarily (mis)uses p) */
3549 for (p = collstart; p < collend; ++p) {
3550 str += sprintf(str, "&#%d;", (int)*p);
3551 }
3552 p = collend;
3553 break;
3554 default:
3555 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3556 encoding, reason, startp, size, &exc,
3557 collstart-startp, collend-startp, &newpos);
3558 if (repunicode == NULL)
3559 goto onError;
3560 /* need more space? (at least enough for what we
3561 have+the replacement+the rest of the string, so
3562 we won't have to check space for encodable characters) */
3563 respos = str-PyString_AS_STRING(res);
3564 repsize = PyUnicode_GET_SIZE(repunicode);
3565 requiredsize = respos+repsize+(endp-collend);
3566 if (requiredsize > ressize) {
3567 if (requiredsize<2*ressize)
3568 requiredsize = 2*ressize;
3569 if (_PyString_Resize(&res, requiredsize)) {
3570 Py_DECREF(repunicode);
3571 goto onError;
3572 }
3573 str = PyString_AS_STRING(res) + respos;
3574 ressize = requiredsize;
3575 }
3576 /* check if there is anything unencodable in the replacement
3577 and copy it to the output */
3578 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3579 c = *uni2;
3580 if (c >= limit) {
3581 raise_encode_exception(&exc, encoding, startp, size,
3582 unicodepos, unicodepos+1, reason);
3583 Py_DECREF(repunicode);
3584 goto onError;
3585 }
3586 *str = (char)c;
3587 }
3588 p = startp + newpos;
3589 Py_DECREF(repunicode);
3590 }
3591 }
3592 }
3593 /* Resize if we allocated to much */
3594 respos = str-PyString_AS_STRING(res);
3595 if (respos<ressize)
3596 /* If this falls res will be NULL */
3597 _PyString_Resize(&res, respos);
3598 Py_XDECREF(errorHandler);
3599 Py_XDECREF(exc);
3600 return res;
3601
3602 onError:
3603 Py_XDECREF(res);
3604 Py_XDECREF(errorHandler);
3605 Py_XDECREF(exc);
3606 return NULL;
3607}
3608
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 const char *errors)
3612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614}
3615
3616PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3617{
3618 if (!PyUnicode_Check(unicode)) {
3619 PyErr_BadArgument();
3620 return NULL;
3621 }
3622 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3623 PyUnicode_GET_SIZE(unicode),
3624 NULL);
3625}
3626
3627/* --- 7-bit ASCII Codec -------------------------------------------------- */
3628
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003630 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 const char *errors)
3632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 PyUnicodeObject *v;
3635 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003636 Py_ssize_t startinpos;
3637 Py_ssize_t endinpos;
3638 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 const char *e;
3640 PyObject *errorHandler = NULL;
3641 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003642
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003644 if (size == 1 && *(unsigned char*)s < 128) {
3645 Py_UNICODE r = *(unsigned char*)s;
3646 return PyUnicode_FromUnicode(&r, 1);
3647 }
Tim Petersced69f82003-09-16 20:30:58 +00003648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 v = _PyUnicode_New(size);
3650 if (v == NULL)
3651 goto onError;
3652 if (size == 0)
3653 return (PyObject *)v;
3654 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 e = s + size;
3656 while (s < e) {
3657 register unsigned char c = (unsigned char)*s;
3658 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 ++s;
3661 }
3662 else {
3663 startinpos = s-starts;
3664 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003665 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 if (unicode_decode_call_errorhandler(
3667 errors, &errorHandler,
3668 "ascii", "ordinal not in range(128)",
3669 starts, size, &startinpos, &endinpos, &exc, &s,
3670 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003674 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003675 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003676 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 Py_XDECREF(errorHandler);
3678 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 onError:
3682 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 Py_XDECREF(errorHandler);
3684 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 return NULL;
3686}
3687
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003689 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 const char *errors)
3691{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693}
3694
3695PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3696{
3697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
3699 return NULL;
3700 }
3701 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3702 PyUnicode_GET_SIZE(unicode),
3703 NULL);
3704}
3705
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003706#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003707
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003708/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003709
Martin v. Löwisd8251432006-06-14 05:21:04 +00003710#if SIZEOF_INT < SIZEOF_SSIZE_T
3711#define NEED_RETRY
3712#endif
3713
3714/* XXX This code is limited to "true" double-byte encodings, as
3715 a) it assumes an incomplete character consists of a single byte, and
3716 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3717 encodings, see IsDBCSLeadByteEx documentation. */
3718
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts still counts
   only if CharPrev() finds no earlier character, or the previous
   character's first byte is not a lead byte, or the previous character
   is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3729
3730/*
3731 * Decode MBCS string into unicode object. If 'final' is set, converts
3732 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3733 */
3734static int decode_mbcs(PyUnicodeObject **v,
3735 const char *s, /* MBCS string */
3736 int size, /* sizeof MBCS string */
3737 int final)
3738{
3739 Py_UNICODE *p;
3740 Py_ssize_t n = 0;
3741 int usize = 0;
3742
3743 assert(size >= 0);
3744
3745 /* Skip trailing lead-byte unless 'final' is set */
3746 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3747 --size;
3748
3749 /* First get the size of the result */
3750 if (size > 0) {
3751 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3752 if (usize == 0) {
3753 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3754 return -1;
3755 }
3756 }
3757
3758 if (*v == NULL) {
3759 /* Create unicode object */
3760 *v = _PyUnicode_New(usize);
3761 if (*v == NULL)
3762 return -1;
3763 }
3764 else {
3765 /* Extend unicode object */
3766 n = PyUnicode_GET_SIZE(*v);
3767 if (_PyUnicode_Resize(v, n + usize) < 0)
3768 return -1;
3769 }
3770
3771 /* Do the conversion */
3772 if (size > 0) {
3773 p = PyUnicode_AS_UNICODE(*v) + n;
3774 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3775 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3776 return -1;
3777 }
3778 }
3779
3780 return size;
3781}
3782
3783PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3784 Py_ssize_t size,
3785 const char *errors,
3786 Py_ssize_t *consumed)
3787{
3788 PyUnicodeObject *v = NULL;
3789 int done;
3790
3791 if (consumed)
3792 *consumed = 0;
3793
3794#ifdef NEED_RETRY
3795 retry:
3796 if (size > INT_MAX)
3797 done = decode_mbcs(&v, s, INT_MAX, 0);
3798 else
3799#endif
3800 done = decode_mbcs(&v, s, (int)size, !consumed);
3801
3802 if (done < 0) {
3803 Py_XDECREF(v);
3804 return NULL;
3805 }
3806
3807 if (consumed)
3808 *consumed += done;
3809
3810#ifdef NEED_RETRY
3811 if (size > INT_MAX) {
3812 s += done;
3813 size -= done;
3814 goto retry;
3815 }
3816#endif
3817
3818 return (PyObject *)v;
3819}
3820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003821PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003823 const char *errors)
3824{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003825 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3826}
3827
3828/*
3829 * Convert unicode into string object (MBCS).
3830 * Returns 0 if succeed, -1 otherwise.
3831 */
3832static int encode_mbcs(PyObject **repr,
3833 const Py_UNICODE *p, /* unicode */
3834 int size) /* size of unicode */
3835{
3836 int mbcssize = 0;
3837 Py_ssize_t n = 0;
3838
3839 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003840
3841 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003842 if (size > 0) {
3843 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3844 if (mbcssize == 0) {
3845 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3846 return -1;
3847 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003848 }
3849
Martin v. Löwisd8251432006-06-14 05:21:04 +00003850 if (*repr == NULL) {
3851 /* Create string object */
3852 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3853 if (*repr == NULL)
3854 return -1;
3855 }
3856 else {
3857 /* Extend string object */
3858 n = PyString_Size(*repr);
3859 if (_PyString_Resize(repr, n + mbcssize) < 0)
3860 return -1;
3861 }
3862
3863 /* Do the conversion */
3864 if (size > 0) {
3865 char *s = PyString_AS_STRING(*repr) + n;
3866 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3867 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3868 return -1;
3869 }
3870 }
3871
3872 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003873}
3874
3875PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003876 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003877 const char *errors)
3878{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003879 PyObject *repr = NULL;
3880 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003881
Martin v. Löwisd8251432006-06-14 05:21:04 +00003882#ifdef NEED_RETRY
3883 retry:
3884 if (size > INT_MAX)
3885 ret = encode_mbcs(&repr, p, INT_MAX);
3886 else
3887#endif
3888 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003889
Martin v. Löwisd8251432006-06-14 05:21:04 +00003890 if (ret < 0) {
3891 Py_XDECREF(repr);
3892 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003893 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003894
3895#ifdef NEED_RETRY
3896 if (size > INT_MAX) {
3897 p += INT_MAX;
3898 size -= INT_MAX;
3899 goto retry;
3900 }
3901#endif
3902
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903 return repr;
3904}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003905
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003906PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3907{
3908 if (!PyUnicode_Check(unicode)) {
3909 PyErr_BadArgument();
3910 return NULL;
3911 }
3912 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3913 PyUnicode_GET_SIZE(unicode),
3914 NULL);
3915}
3916
Martin v. Löwisd8251432006-06-14 05:21:04 +00003917#undef NEED_RETRY
3918
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003919#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003920
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921/* --- Character Mapping Codec -------------------------------------------- */
3922
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003924 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 PyObject *mapping,
3926 const char *errors)
3927{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003929 Py_ssize_t startinpos;
3930 Py_ssize_t endinpos;
3931 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 PyUnicodeObject *v;
3934 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003935 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 PyObject *errorHandler = NULL;
3937 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003938 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003940
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 /* Default to Latin-1 */
3942 if (mapping == NULL)
3943 return PyUnicode_DecodeLatin1(s, size, errors);
3944
3945 v = _PyUnicode_New(size);
3946 if (v == NULL)
3947 goto onError;
3948 if (size == 0)
3949 return (PyObject *)v;
3950 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003952 if (PyUnicode_CheckExact(mapping)) {
3953 mapstring = PyUnicode_AS_UNICODE(mapping);
3954 maplen = PyUnicode_GET_SIZE(mapping);
3955 while (s < e) {
3956 unsigned char ch = *s;
3957 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 if (ch < maplen)
3960 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003962 if (x == 0xfffe) {
3963 /* undefined mapping */
3964 outpos = p-PyUnicode_AS_UNICODE(v);
3965 startinpos = s-starts;
3966 endinpos = startinpos+1;
3967 if (unicode_decode_call_errorhandler(
3968 errors, &errorHandler,
3969 "charmap", "character maps to <undefined>",
3970 starts, size, &startinpos, &endinpos, &exc, &s,
3971 (PyObject **)&v, &outpos, &p)) {
3972 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003973 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003974 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003975 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003976 *p++ = x;
3977 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003979 }
3980 else {
3981 while (s < e) {
3982 unsigned char ch = *s;
3983 PyObject *w, *x;
3984
3985 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3986 w = PyInt_FromLong((long)ch);
3987 if (w == NULL)
3988 goto onError;
3989 x = PyObject_GetItem(mapping, w);
3990 Py_DECREF(w);
3991 if (x == NULL) {
3992 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3993 /* No mapping found means: mapping is undefined. */
3994 PyErr_Clear();
3995 x = Py_None;
3996 Py_INCREF(x);
3997 } else
3998 goto onError;
3999 }
4000
4001 /* Apply mapping */
4002 if (PyInt_Check(x)) {
4003 long value = PyInt_AS_LONG(x);
4004 if (value < 0 || value > 65535) {
4005 PyErr_SetString(PyExc_TypeError,
4006 "character mapping must be in range(65536)");
4007 Py_DECREF(x);
4008 goto onError;
4009 }
4010 *p++ = (Py_UNICODE)value;
4011 }
4012 else if (x == Py_None) {
4013 /* undefined mapping */
4014 outpos = p-PyUnicode_AS_UNICODE(v);
4015 startinpos = s-starts;
4016 endinpos = startinpos+1;
4017 if (unicode_decode_call_errorhandler(
4018 errors, &errorHandler,
4019 "charmap", "character maps to <undefined>",
4020 starts, size, &startinpos, &endinpos, &exc, &s,
4021 (PyObject **)&v, &outpos, &p)) {
4022 Py_DECREF(x);
4023 goto onError;
4024 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004025 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004026 continue;
4027 }
4028 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004029 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004030
4031 if (targetsize == 1)
4032 /* 1-1 mapping */
4033 *p++ = *PyUnicode_AS_UNICODE(x);
4034
4035 else if (targetsize > 1) {
4036 /* 1-n mapping */
4037 if (targetsize > extrachars) {
4038 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4040 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004041 (targetsize << 2);
4042 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004043 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004044 if (_PyUnicode_Resize(&v,
4045 PyUnicode_GET_SIZE(v) + needed) < 0) {
4046 Py_DECREF(x);
4047 goto onError;
4048 }
4049 p = PyUnicode_AS_UNICODE(v) + oldpos;
4050 }
4051 Py_UNICODE_COPY(p,
4052 PyUnicode_AS_UNICODE(x),
4053 targetsize);
4054 p += targetsize;
4055 extrachars -= targetsize;
4056 }
4057 /* 1-0 mapping: skip the character */
4058 }
4059 else {
4060 /* wrong return value */
4061 PyErr_SetString(PyExc_TypeError,
4062 "character mapping must return integer, None or unicode");
4063 Py_DECREF(x);
4064 goto onError;
4065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
4070 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004071 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 Py_XDECREF(errorHandler);
4074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 Py_XDECREF(errorHandler);
4079 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 Py_XDECREF(v);
4081 return NULL;
4082}
4083
Martin v. Löwis3f767792006-06-04 19:36:28 +00004084/* Charmap encoding: the lookup table */
4085
4086struct encoding_map{
4087 PyObject_HEAD
4088 unsigned char level1[32];
4089 int count2, count3;
4090 unsigned char level23[1];
4091};
4092
4093static PyObject*
4094encoding_map_size(PyObject *obj, PyObject* args)
4095{
4096 struct encoding_map *map = (struct encoding_map*)obj;
4097 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4098 128*map->count3);
4099}
4100
4101static PyMethodDef encoding_map_methods[] = {
4102 {"size", encoding_map_size, METH_NOARGS,
4103 PyDoc_STR("Return the size (in bytes) of this object") },
4104 { 0 }
4105};
4106
4107static void
4108encoding_map_dealloc(PyObject* o)
4109{
4110 PyObject_FREE(o);
4111}
4112
4113static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004114 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004115 "EncodingMap", /*tp_name*/
4116 sizeof(struct encoding_map), /*tp_basicsize*/
4117 0, /*tp_itemsize*/
4118 /* methods */
4119 encoding_map_dealloc, /*tp_dealloc*/
4120 0, /*tp_print*/
4121 0, /*tp_getattr*/
4122 0, /*tp_setattr*/
4123 0, /*tp_compare*/
4124 0, /*tp_repr*/
4125 0, /*tp_as_number*/
4126 0, /*tp_as_sequence*/
4127 0, /*tp_as_mapping*/
4128 0, /*tp_hash*/
4129 0, /*tp_call*/
4130 0, /*tp_str*/
4131 0, /*tp_getattro*/
4132 0, /*tp_setattro*/
4133 0, /*tp_as_buffer*/
4134 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4135 0, /*tp_doc*/
4136 0, /*tp_traverse*/
4137 0, /*tp_clear*/
4138 0, /*tp_richcompare*/
4139 0, /*tp_weaklistoffset*/
4140 0, /*tp_iter*/
4141 0, /*tp_iternext*/
4142 encoding_map_methods, /*tp_methods*/
4143 0, /*tp_members*/
4144 0, /*tp_getset*/
4145 0, /*tp_base*/
4146 0, /*tp_dict*/
4147 0, /*tp_descr_get*/
4148 0, /*tp_descr_set*/
4149 0, /*tp_dictoffset*/
4150 0, /*tp_init*/
4151 0, /*tp_alloc*/
4152 0, /*tp_new*/
4153 0, /*tp_free*/
4154 0, /*tp_is_gc*/
4155};
4156
4157PyObject*
4158PyUnicode_BuildEncodingMap(PyObject* string)
4159{
4160 Py_UNICODE *decode;
4161 PyObject *result;
4162 struct encoding_map *mresult;
4163 int i;
4164 int need_dict = 0;
4165 unsigned char level1[32];
4166 unsigned char level2[512];
4167 unsigned char *mlevel1, *mlevel2, *mlevel3;
4168 int count2 = 0, count3 = 0;
4169
4170 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4171 PyErr_BadArgument();
4172 return NULL;
4173 }
4174 decode = PyUnicode_AS_UNICODE(string);
4175 memset(level1, 0xFF, sizeof level1);
4176 memset(level2, 0xFF, sizeof level2);
4177
4178 /* If there isn't a one-to-one mapping of NULL to \0,
4179 or if there are non-BMP characters, we need to use
4180 a mapping dictionary. */
4181 if (decode[0] != 0)
4182 need_dict = 1;
4183 for (i = 1; i < 256; i++) {
4184 int l1, l2;
4185 if (decode[i] == 0
4186 #ifdef Py_UNICODE_WIDE
4187 || decode[i] > 0xFFFF
4188 #endif
4189 ) {
4190 need_dict = 1;
4191 break;
4192 }
4193 if (decode[i] == 0xFFFE)
4194 /* unmapped character */
4195 continue;
4196 l1 = decode[i] >> 11;
4197 l2 = decode[i] >> 7;
4198 if (level1[l1] == 0xFF)
4199 level1[l1] = count2++;
4200 if (level2[l2] == 0xFF)
4201 level2[l2] = count3++;
4202 }
4203
4204 if (count2 >= 0xFF || count3 >= 0xFF)
4205 need_dict = 1;
4206
4207 if (need_dict) {
4208 PyObject *result = PyDict_New();
4209 PyObject *key, *value;
4210 if (!result)
4211 return NULL;
4212 for (i = 0; i < 256; i++) {
4213 key = value = NULL;
4214 key = PyInt_FromLong(decode[i]);
4215 value = PyInt_FromLong(i);
4216 if (!key || !value)
4217 goto failed1;
4218 if (PyDict_SetItem(result, key, value) == -1)
4219 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004220 Py_DECREF(key);
4221 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004222 }
4223 return result;
4224 failed1:
4225 Py_XDECREF(key);
4226 Py_XDECREF(value);
4227 Py_DECREF(result);
4228 return NULL;
4229 }
4230
4231 /* Create a three-level trie */
4232 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4233 16*count2 + 128*count3 - 1);
4234 if (!result)
4235 return PyErr_NoMemory();
4236 PyObject_Init(result, &EncodingMapType);
4237 mresult = (struct encoding_map*)result;
4238 mresult->count2 = count2;
4239 mresult->count3 = count3;
4240 mlevel1 = mresult->level1;
4241 mlevel2 = mresult->level23;
4242 mlevel3 = mresult->level23 + 16*count2;
4243 memcpy(mlevel1, level1, 32);
4244 memset(mlevel2, 0xFF, 16*count2);
4245 memset(mlevel3, 0, 128*count3);
4246 count3 = 0;
4247 for (i = 1; i < 256; i++) {
4248 int o1, o2, o3, i2, i3;
4249 if (decode[i] == 0xFFFE)
4250 /* unmapped character */
4251 continue;
4252 o1 = decode[i]>>11;
4253 o2 = (decode[i]>>7) & 0xF;
4254 i2 = 16*mlevel1[o1] + o2;
4255 if (mlevel2[i2] == 0xFF)
4256 mlevel2[i2] = count3++;
4257 o3 = decode[i] & 0x7F;
4258 i3 = 128*mlevel2[i2] + o3;
4259 mlevel3[i3] = i;
4260 }
4261 return result;
4262}
4263
4264static int
4265encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4266{
4267 struct encoding_map *map = (struct encoding_map*)mapping;
4268 int l1 = c>>11;
4269 int l2 = (c>>7) & 0xF;
4270 int l3 = c & 0x7F;
4271 int i;
4272
4273#ifdef Py_UNICODE_WIDE
4274 if (c > 0xFFFF) {
4275 return -1;
4276 }
4277#endif
4278 if (c == 0)
4279 return 0;
4280 /* level 1*/
4281 i = map->level1[l1];
4282 if (i == 0xFF) {
4283 return -1;
4284 }
4285 /* level 2*/
4286 i = map->level23[16*i+l2];
4287 if (i == 0xFF) {
4288 return -1;
4289 }
4290 /* level 3 */
4291 i = map->level23[16*map->count2 + 128*i + l3];
4292 if (i == 0) {
4293 return -1;
4294 }
4295 return i;
4296}
4297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298/* Lookup the character ch in the mapping. If the character
4299 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004300 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 PyObject *w = PyInt_FromLong((long)c);
4304 PyObject *x;
4305
4306 if (w == NULL)
4307 return NULL;
4308 x = PyObject_GetItem(mapping, w);
4309 Py_DECREF(w);
4310 if (x == NULL) {
4311 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4312 /* No mapping found means: mapping is undefined. */
4313 PyErr_Clear();
4314 x = Py_None;
4315 Py_INCREF(x);
4316 return x;
4317 } else
4318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004320 else if (x == Py_None)
4321 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 else if (PyInt_Check(x)) {
4323 long value = PyInt_AS_LONG(x);
4324 if (value < 0 || value > 255) {
4325 PyErr_SetString(PyExc_TypeError,
4326 "character mapping must be in range(256)");
4327 Py_DECREF(x);
4328 return NULL;
4329 }
4330 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 else if (PyString_Check(x))
4333 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 /* wrong return value */
4336 PyErr_SetString(PyExc_TypeError,
4337 "character mapping must return integer, None or str");
4338 Py_DECREF(x);
4339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 }
4341}
4342
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343static int
4344charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4345{
4346 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4347 /* exponentially overallocate to minimize reallocations */
4348 if (requiredsize < 2*outsize)
4349 requiredsize = 2*outsize;
4350 if (_PyString_Resize(outobj, requiredsize)) {
4351 return 0;
4352 }
4353 return 1;
4354}
4355
/* Result codes of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,      /* the character was encoded and written */
    enc_FAILED,       /* the mapping has no entry for the character */
    enc_EXCEPTION     /* lookup or reallocation failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359/* lookup the character, put the result in the output string and adjust
4360 various state variables. Reallocate the output string if not enough
4361 space is available. Return a new reference to the object that
4362 was put in the output buffer, or Py_None, if the mapping was undefined
4363 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004364 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004366charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004369 PyObject *rep;
4370 char *outstart;
4371 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372
Christian Heimese93237d2007-12-19 02:37:44 +00004373 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004374 int res = encoding_map_lookup(c, mapping);
4375 Py_ssize_t requiredsize = *outpos+1;
4376 if (res == -1)
4377 return enc_FAILED;
4378 if (outsize<requiredsize)
4379 if (!charmapencode_resize(outobj, outpos, requiredsize))
4380 return enc_EXCEPTION;
4381 outstart = PyString_AS_STRING(*outobj);
4382 outstart[(*outpos)++] = (char)res;
4383 return enc_SUCCESS;
4384 }
4385
4386 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004388 return enc_EXCEPTION;
4389 else if (rep==Py_None) {
4390 Py_DECREF(rep);
4391 return enc_FAILED;
4392 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004394 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004395 if (outsize<requiredsize)
4396 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004398 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004400 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4402 }
4403 else {
4404 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004405 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4406 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004407 if (outsize<requiredsize)
4408 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004410 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004412 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 memcpy(outstart + *outpos, repchars, repsize);
4414 *outpos += repsize;
4415 }
4416 }
Georg Brandl9f167602006-06-04 21:46:16 +00004417 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004418 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419}
4420
4421/* handle an error in PyUnicode_EncodeCharmap
4422 Return 0 on success, -1 on error */
4423static
4424int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004425 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004427 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429{
4430 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t repsize;
4432 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 Py_UNICODE *uni2;
4434 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 Py_ssize_t collstartpos = *inpos;
4436 Py_ssize_t collendpos = *inpos+1;
4437 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 char *encoding = "charmap";
4439 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004440 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* find all unencodable characters */
4443 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004444 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004445 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004446 int res = encoding_map_lookup(p[collendpos], mapping);
4447 if (res != -1)
4448 break;
4449 ++collendpos;
4450 continue;
4451 }
4452
4453 rep = charmapencode_lookup(p[collendpos], mapping);
4454 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004456 else if (rep!=Py_None) {
4457 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 break;
4459 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004460 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 ++collendpos;
4462 }
4463 /* cache callback name lookup
4464 * (if not done yet, i.e. it's the first error) */
4465 if (*known_errorHandler==-1) {
4466 if ((errors==NULL) || (!strcmp(errors, "strict")))
4467 *known_errorHandler = 1;
4468 else if (!strcmp(errors, "replace"))
4469 *known_errorHandler = 2;
4470 else if (!strcmp(errors, "ignore"))
4471 *known_errorHandler = 3;
4472 else if (!strcmp(errors, "xmlcharrefreplace"))
4473 *known_errorHandler = 4;
4474 else
4475 *known_errorHandler = 0;
4476 }
4477 switch (*known_errorHandler) {
4478 case 1: /* strict */
4479 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4480 return -1;
4481 case 2: /* replace */
4482 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4483 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004484 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 return -1;
4486 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004487 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4489 return -1;
4490 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 }
4492 /* fall through */
4493 case 3: /* ignore */
4494 *inpos = collendpos;
4495 break;
4496 case 4: /* xmlcharrefreplace */
4497 /* generate replacement (temporarily (mis)uses p) */
4498 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4499 char buffer[2+29+1+1];
4500 char *cp;
4501 sprintf(buffer, "&#%d;", (int)p[collpos]);
4502 for (cp = buffer; *cp; ++cp) {
4503 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004506 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4508 return -1;
4509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 }
4511 }
4512 *inpos = collendpos;
4513 break;
4514 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004515 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 encoding, reason, p, size, exceptionObject,
4517 collstartpos, collendpos, &newpos);
4518 if (repunicode == NULL)
4519 return -1;
4520 /* generate replacement */
4521 repsize = PyUnicode_GET_SIZE(repunicode);
4522 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4523 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 return -1;
4526 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004527 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4530 return -1;
4531 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 }
4533 *inpos = newpos;
4534 Py_DECREF(repunicode);
4535 }
4536 return 0;
4537}
4538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 PyObject *mapping,
4542 const char *errors)
4543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 /* output object */
4545 PyObject *res = NULL;
4546 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004547 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004549 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 PyObject *errorHandler = NULL;
4551 PyObject *exc = NULL;
4552 /* the following variable is used for caching string comparisons
4553 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4554 * 3=ignore, 4=xmlcharrefreplace */
4555 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556
4557 /* Default to Latin-1 */
4558 if (mapping == NULL)
4559 return PyUnicode_EncodeLatin1(p, size, errors);
4560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 /* allocate enough for a simple encoding without
4562 replacements, if we need more, we'll resize */
4563 res = PyString_FromStringAndSize(NULL, size);
4564 if (res == NULL)
4565 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004566 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 while (inpos<size) {
4570 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004571 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4572 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004574 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 if (charmap_encoding_error(p, size, &inpos, mapping,
4576 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004577 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004578 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004579 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 else
4583 /* done with this character => adjust input position */
4584 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 /* Resize if we allocated to much */
4588 if (respos<PyString_GET_SIZE(res)) {
4589 if (_PyString_Resize(&res, respos))
4590 goto onError;
4591 }
4592 Py_XDECREF(exc);
4593 Py_XDECREF(errorHandler);
4594 return res;
4595
4596 onError:
4597 Py_XDECREF(res);
4598 Py_XDECREF(exc);
4599 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 return NULL;
4601}
4602
4603PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4604 PyObject *mapping)
4605{
4606 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4607 PyErr_BadArgument();
4608 return NULL;
4609 }
4610 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4611 PyUnicode_GET_SIZE(unicode),
4612 mapping,
4613 NULL);
4614}
4615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616/* create or adjust a UnicodeTranslateError */
4617static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004618 const Py_UNICODE *unicode, Py_ssize_t size,
4619 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 if (*exceptionObject == NULL) {
4623 *exceptionObject = PyUnicodeTranslateError_Create(
4624 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 }
4626 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4628 goto onError;
4629 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4630 goto onError;
4631 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4632 goto onError;
4633 return;
4634 onError:
4635 Py_DECREF(*exceptionObject);
4636 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 }
4638}
4639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640/* raises a UnicodeTranslateError */
4641static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 const Py_UNICODE *unicode, Py_ssize_t size,
4643 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 const char *reason)
4645{
4646 make_translate_exception(exceptionObject,
4647 unicode, size, startpos, endpos, reason);
4648 if (*exceptionObject != NULL)
4649 PyCodec_StrictErrors(*exceptionObject);
4650}
4651
4652/* error handling callback helper:
4653 build arguments, call the callback and check the arguments,
4654 put the result into newpos and return the replacement string, which
4655 has to be freed by the caller */
4656static PyObject *unicode_translate_call_errorhandler(const char *errors,
4657 PyObject **errorHandler,
4658 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004659 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4660 Py_ssize_t startpos, Py_ssize_t endpos,
4661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004663 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664
Martin v. Löwis412fb672006-04-13 06:34:32 +00004665 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 PyObject *restuple;
4667 PyObject *resunicode;
4668
4669 if (*errorHandler == NULL) {
4670 *errorHandler = PyCodec_LookupError(errors);
4671 if (*errorHandler == NULL)
4672 return NULL;
4673 }
4674
4675 make_translate_exception(exceptionObject,
4676 unicode, size, startpos, endpos, reason);
4677 if (*exceptionObject == NULL)
4678 return NULL;
4679
4680 restuple = PyObject_CallFunctionObjArgs(
4681 *errorHandler, *exceptionObject, NULL);
4682 if (restuple == NULL)
4683 return NULL;
4684 if (!PyTuple_Check(restuple)) {
4685 PyErr_Format(PyExc_TypeError, &argparse[4]);
4686 Py_DECREF(restuple);
4687 return NULL;
4688 }
4689 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 Py_DECREF(restuple);
4692 return NULL;
4693 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004694 if (i_newpos<0)
4695 *newpos = size+i_newpos;
4696 else
4697 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004698 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004699 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004700 Py_DECREF(restuple);
4701 return NULL;
4702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 Py_INCREF(resunicode);
4704 Py_DECREF(restuple);
4705 return resunicode;
4706}
4707
4708/* Lookup the character ch in the mapping and put the result in result,
4709 which must be decrefed by the caller.
4710 Return 0 on success, -1 on error */
4711static
4712int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4713{
4714 PyObject *w = PyInt_FromLong((long)c);
4715 PyObject *x;
4716
4717 if (w == NULL)
4718 return -1;
4719 x = PyObject_GetItem(mapping, w);
4720 Py_DECREF(w);
4721 if (x == NULL) {
4722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4723 /* No mapping found means: use 1:1 mapping. */
4724 PyErr_Clear();
4725 *result = NULL;
4726 return 0;
4727 } else
4728 return -1;
4729 }
4730 else if (x == Py_None) {
4731 *result = x;
4732 return 0;
4733 }
4734 else if (PyInt_Check(x)) {
4735 long value = PyInt_AS_LONG(x);
4736 long max = PyUnicode_GetMax();
4737 if (value < 0 || value > max) {
4738 PyErr_Format(PyExc_TypeError,
4739 "character mapping must be in range(0x%lx)", max+1);
4740 Py_DECREF(x);
4741 return -1;
4742 }
4743 *result = x;
4744 return 0;
4745 }
4746 else if (PyUnicode_Check(x)) {
4747 *result = x;
4748 return 0;
4749 }
4750 else {
4751 /* wrong return value */
4752 PyErr_SetString(PyExc_TypeError,
4753 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004754 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 return -1;
4756 }
4757}
4758/* ensure that *outobj is at least requiredsize characters long,
4759if not reallocate and adjust various state variables.
4760Return 0 on success, -1 on error */
4761static
Walter Dörwald4894c302003-10-24 14:25:28 +00004762int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004765 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004766 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004768 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004770 if (requiredsize < 2 * oldsize)
4771 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004772 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 return -1;
4774 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 }
4776 return 0;
4777}
4778/* lookup the character, put the result in the output string and adjust
4779 various state variables. Return a new reference to the object that
4780 was put in the output buffer in *result, or Py_None, if the mapping was
4781 undefined (in which case no character was written).
4782 The called must decref result.
4783 Return 0 on success, -1 on error. */
4784static
Walter Dörwald4894c302003-10-24 14:25:28 +00004785int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004787 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788{
Walter Dörwald4894c302003-10-24 14:25:28 +00004789 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 return -1;
4791 if (*res==NULL) {
4792 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004793 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 }
4795 else if (*res==Py_None)
4796 ;
4797 else if (PyInt_Check(*res)) {
4798 /* no overflow check, because we know that the space is enough */
4799 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4800 }
4801 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 if (repsize==1) {
4804 /* no overflow check, because we know that the space is enough */
4805 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4806 }
4807 else if (repsize!=0) {
4808 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004809 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004810 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004811 repsize - 1;
4812 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004813 return -1;
4814 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4815 *outp += repsize;
4816 }
4817 }
4818 else
4819 return -1;
4820 return 0;
4821}
4822
4823PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 PyObject *mapping,
4826 const char *errors)
4827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 /* output object */
4829 PyObject *res = NULL;
4830 /* pointers to the beginning and end+1 of input */
4831 const Py_UNICODE *startp = p;
4832 const Py_UNICODE *endp = p + size;
4833 /* pointer into the output */
4834 Py_UNICODE *str;
4835 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004836 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 char *reason = "character maps to <undefined>";
4838 PyObject *errorHandler = NULL;
4839 PyObject *exc = NULL;
4840 /* the following variable is used for caching string comparisons
4841 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4842 * 3=ignore, 4=xmlcharrefreplace */
4843 int known_errorHandler = -1;
4844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 if (mapping == NULL) {
4846 PyErr_BadArgument();
4847 return NULL;
4848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849
4850 /* allocate enough for a simple 1:1 translation without
4851 replacements, if we need more, we'll resize */
4852 res = PyUnicode_FromUnicode(NULL, size);
4853 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004854 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 return res;
4857 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859 while (p<endp) {
4860 /* try to encode it */
4861 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004862 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 goto onError;
4865 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004866 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 if (x!=Py_None) /* it worked => adjust input pointer */
4868 ++p;
4869 else { /* untranslatable character */
4870 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t repsize;
4872 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 Py_UNICODE *uni2;
4874 /* startpos for collecting untranslatable chars */
4875 const Py_UNICODE *collstart = p;
4876 const Py_UNICODE *collend = p+1;
4877 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 /* find all untranslatable characters */
4880 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004881 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 goto onError;
4883 Py_XDECREF(x);
4884 if (x!=Py_None)
4885 break;
4886 ++collend;
4887 }
4888 /* cache callback name lookup
4889 * (if not done yet, i.e. it's the first error) */
4890 if (known_errorHandler==-1) {
4891 if ((errors==NULL) || (!strcmp(errors, "strict")))
4892 known_errorHandler = 1;
4893 else if (!strcmp(errors, "replace"))
4894 known_errorHandler = 2;
4895 else if (!strcmp(errors, "ignore"))
4896 known_errorHandler = 3;
4897 else if (!strcmp(errors, "xmlcharrefreplace"))
4898 known_errorHandler = 4;
4899 else
4900 known_errorHandler = 0;
4901 }
4902 switch (known_errorHandler) {
4903 case 1: /* strict */
4904 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4905 goto onError;
4906 case 2: /* replace */
4907 /* No need to check for space, this is a 1:1 replacement */
4908 for (coll = collstart; coll<collend; ++coll)
4909 *str++ = '?';
4910 /* fall through */
4911 case 3: /* ignore */
4912 p = collend;
4913 break;
4914 case 4: /* xmlcharrefreplace */
4915 /* generate replacement (temporarily (mis)uses p) */
4916 for (p = collstart; p < collend; ++p) {
4917 char buffer[2+29+1+1];
4918 char *cp;
4919 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004920 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4922 goto onError;
4923 for (cp = buffer; *cp; ++cp)
4924 *str++ = *cp;
4925 }
4926 p = collend;
4927 break;
4928 default:
4929 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4930 reason, startp, size, &exc,
4931 collstart-startp, collend-startp, &newpos);
4932 if (repunicode == NULL)
4933 goto onError;
4934 /* generate replacement */
4935 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004936 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4938 Py_DECREF(repunicode);
4939 goto onError;
4940 }
4941 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4942 *str++ = *uni2;
4943 p = startp + newpos;
4944 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 }
4946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 /* Resize if we allocated to much */
4949 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004950 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004951 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004952 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 }
4954 Py_XDECREF(exc);
4955 Py_XDECREF(errorHandler);
4956 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 onError:
4959 Py_XDECREF(res);
4960 Py_XDECREF(exc);
4961 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 return NULL;
4963}
4964
4965PyObject *PyUnicode_Translate(PyObject *str,
4966 PyObject *mapping,
4967 const char *errors)
4968{
4969 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004970
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 str = PyUnicode_FromObject(str);
4972 if (str == NULL)
4973 goto onError;
4974 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4975 PyUnicode_GET_SIZE(str),
4976 mapping,
4977 errors);
4978 Py_DECREF(str);
4979 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004980
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 onError:
4982 Py_XDECREF(str);
4983 return NULL;
4984}
Tim Petersced69f82003-09-16 20:30:58 +00004985
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986/* --- Decimal Encoder ---------------------------------------------------- */
4987
4988int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004989 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004990 char *output,
4991 const char *errors)
4992{
4993 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 PyObject *errorHandler = NULL;
4995 PyObject *exc = NULL;
4996 const char *encoding = "decimal";
4997 const char *reason = "invalid decimal Unicode string";
4998 /* the following variable is used for caching string comparisons
4999 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5000 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005001
5002 if (output == NULL) {
5003 PyErr_BadArgument();
5004 return -1;
5005 }
5006
5007 p = s;
5008 end = s + length;
5009 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005011 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005013 Py_ssize_t repsize;
5014 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 Py_UNICODE *uni2;
5016 Py_UNICODE *collstart;
5017 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005018
Guido van Rossum9e896b32000-04-05 20:11:21 +00005019 if (Py_UNICODE_ISSPACE(ch)) {
5020 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005022 continue;
5023 }
5024 decimal = Py_UNICODE_TODECIMAL(ch);
5025 if (decimal >= 0) {
5026 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005028 continue;
5029 }
Guido van Rossumba477042000-04-06 18:18:10 +00005030 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005031 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005033 continue;
5034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035 /* All other characters are considered unencodable */
5036 collstart = p;
5037 collend = p+1;
5038 while (collend < end) {
5039 if ((0 < *collend && *collend < 256) ||
5040 !Py_UNICODE_ISSPACE(*collend) ||
5041 Py_UNICODE_TODECIMAL(*collend))
5042 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 /* cache callback name lookup
5045 * (if not done yet, i.e. it's the first error) */
5046 if (known_errorHandler==-1) {
5047 if ((errors==NULL) || (!strcmp(errors, "strict")))
5048 known_errorHandler = 1;
5049 else if (!strcmp(errors, "replace"))
5050 known_errorHandler = 2;
5051 else if (!strcmp(errors, "ignore"))
5052 known_errorHandler = 3;
5053 else if (!strcmp(errors, "xmlcharrefreplace"))
5054 known_errorHandler = 4;
5055 else
5056 known_errorHandler = 0;
5057 }
5058 switch (known_errorHandler) {
5059 case 1: /* strict */
5060 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5061 goto onError;
5062 case 2: /* replace */
5063 for (p = collstart; p < collend; ++p)
5064 *output++ = '?';
5065 /* fall through */
5066 case 3: /* ignore */
5067 p = collend;
5068 break;
5069 case 4: /* xmlcharrefreplace */
5070 /* generate replacement (temporarily (mis)uses p) */
5071 for (p = collstart; p < collend; ++p)
5072 output += sprintf(output, "&#%d;", (int)*p);
5073 p = collend;
5074 break;
5075 default:
5076 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5077 encoding, reason, s, length, &exc,
5078 collstart-s, collend-s, &newpos);
5079 if (repunicode == NULL)
5080 goto onError;
5081 /* generate replacement */
5082 repsize = PyUnicode_GET_SIZE(repunicode);
5083 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5084 Py_UNICODE ch = *uni2;
5085 if (Py_UNICODE_ISSPACE(ch))
5086 *output++ = ' ';
5087 else {
5088 decimal = Py_UNICODE_TODECIMAL(ch);
5089 if (decimal >= 0)
5090 *output++ = '0' + decimal;
5091 else if (0 < ch && ch < 256)
5092 *output++ = (char)ch;
5093 else {
5094 Py_DECREF(repunicode);
5095 raise_encode_exception(&exc, encoding,
5096 s, length, collstart-s, collend-s, reason);
5097 goto onError;
5098 }
5099 }
5100 }
5101 p = s + newpos;
5102 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005103 }
5104 }
5105 /* 0-terminate the output string */
5106 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 Py_XDECREF(exc);
5108 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005109 return 0;
5110
5111 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 Py_XDECREF(exc);
5113 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005114 return -1;
5115}
5116
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117/* --- Helpers ------------------------------------------------------------ */
5118
Eric Smitha9f7d622008-02-17 19:46:49 +00005119#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005120
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005121#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005122
Fredrik Lundha50d2012006-05-26 17:04:58 +00005123#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005124
5125#include "stringlib/count.h"
5126#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005127#include "stringlib/partition.h"
5128
/* Normalise the slice bounds held in the caller's local variables
   `start` and `end` against (obj)->length: a negative bound is offset
   by the length and then floored at 0, and `end` is capped at the
   length first.  `start` is not capped at the length.  Expands to
   plain statements, so use it only where a statement sequence is
   valid. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5141
Martin v. Löwis18e16552006-02-15 17:27:45 +00005142Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005143 PyObject *substr,
5144 Py_ssize_t start,
5145 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005147 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005148 PyUnicodeObject* str_obj;
5149 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005150
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005151 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5152 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005154 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5155 if (!sub_obj) {
5156 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return -1;
5158 }
Tim Petersced69f82003-09-16 20:30:58 +00005159
Fredrik Lundhc8162812006-05-26 19:33:03 +00005160 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005161
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005162 result = stringlib_count(
5163 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5164 );
5165
5166 Py_DECREF(sub_obj);
5167 Py_DECREF(str_obj);
5168
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 return result;
5170}
5171
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005173 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005174 Py_ssize_t start,
5175 Py_ssize_t end,
5176 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005178 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005179
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005180 str = PyUnicode_FromObject(str);
5181 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005182 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005183 sub = PyUnicode_FromObject(sub);
5184 if (!sub) {
5185 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005186 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 }
Tim Petersced69f82003-09-16 20:30:58 +00005188
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005189 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005190 result = stringlib_find_slice(
5191 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5192 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5193 start, end
5194 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005195 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005196 result = stringlib_rfind_slice(
5197 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5198 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5199 start, end
5200 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005201
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005202 Py_DECREF(str);
5203 Py_DECREF(sub);
5204
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 return result;
5206}
5207
Tim Petersced69f82003-09-16 20:30:58 +00005208static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209int tailmatch(PyUnicodeObject *self,
5210 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005211 Py_ssize_t start,
5212 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 int direction)
5214{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 if (substring->length == 0)
5216 return 1;
5217
Fredrik Lundhc8162812006-05-26 19:33:03 +00005218 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 end -= substring->length;
5221 if (end < start)
5222 return 0;
5223
5224 if (direction > 0) {
5225 if (Py_UNICODE_MATCH(self, end, substring))
5226 return 1;
5227 } else {
5228 if (Py_UNICODE_MATCH(self, start, substring))
5229 return 1;
5230 }
5231
5232 return 0;
5233}
5234
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t start,
5238 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 int direction)
5240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005241 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005242
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 str = PyUnicode_FromObject(str);
5244 if (str == NULL)
5245 return -1;
5246 substr = PyUnicode_FromObject(substr);
5247 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005248 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 return -1;
5250 }
Tim Petersced69f82003-09-16 20:30:58 +00005251
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 result = tailmatch((PyUnicodeObject *)str,
5253 (PyUnicodeObject *)substr,
5254 start, end, direction);
5255 Py_DECREF(str);
5256 Py_DECREF(substr);
5257 return result;
5258}
5259
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260/* Apply fixfct filter to the Unicode object self and return a
5261 reference to the modified object */
5262
Tim Petersced69f82003-09-16 20:30:58 +00005263static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264PyObject *fixup(PyUnicodeObject *self,
5265 int (*fixfct)(PyUnicodeObject *s))
5266{
5267
5268 PyUnicodeObject *u;
5269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005270 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 if (u == NULL)
5272 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005273
5274 Py_UNICODE_COPY(u->str, self->str, self->length);
5275
Tim Peters7a29bd52001-09-12 03:03:31 +00005276 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 /* fixfct should return TRUE if it modified the buffer. If
5278 FALSE, return a reference to the original buffer instead
5279 (to save space, not time) */
5280 Py_INCREF(self);
5281 Py_DECREF(u);
5282 return (PyObject*) self;
5283 }
5284 return (PyObject*) u;
5285}
5286
Tim Petersced69f82003-09-16 20:30:58 +00005287static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288int fixupper(PyUnicodeObject *self)
5289{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005290 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 Py_UNICODE *s = self->str;
5292 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 while (len-- > 0) {
5295 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 ch = Py_UNICODE_TOUPPER(*s);
5298 if (ch != *s) {
5299 status = 1;
5300 *s = ch;
5301 }
5302 s++;
5303 }
5304
5305 return status;
5306}
5307
Tim Petersced69f82003-09-16 20:30:58 +00005308static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309int fixlower(PyUnicodeObject *self)
5310{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005311 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 Py_UNICODE *s = self->str;
5313 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005314
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 while (len-- > 0) {
5316 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 ch = Py_UNICODE_TOLOWER(*s);
5319 if (ch != *s) {
5320 status = 1;
5321 *s = ch;
5322 }
5323 s++;
5324 }
5325
5326 return status;
5327}
5328
Tim Petersced69f82003-09-16 20:30:58 +00005329static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330int fixswapcase(PyUnicodeObject *self)
5331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 Py_UNICODE *s = self->str;
5334 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 while (len-- > 0) {
5337 if (Py_UNICODE_ISUPPER(*s)) {
5338 *s = Py_UNICODE_TOLOWER(*s);
5339 status = 1;
5340 } else if (Py_UNICODE_ISLOWER(*s)) {
5341 *s = Py_UNICODE_TOUPPER(*s);
5342 status = 1;
5343 }
5344 s++;
5345 }
5346
5347 return status;
5348}
5349
Tim Petersced69f82003-09-16 20:30:58 +00005350static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351int fixcapitalize(PyUnicodeObject *self)
5352{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005354 Py_UNICODE *s = self->str;
5355 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005356
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005357 if (len == 0)
5358 return 0;
5359 if (Py_UNICODE_ISLOWER(*s)) {
5360 *s = Py_UNICODE_TOUPPER(*s);
5361 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005363 s++;
5364 while (--len > 0) {
5365 if (Py_UNICODE_ISUPPER(*s)) {
5366 *s = Py_UNICODE_TOLOWER(*s);
5367 status = 1;
5368 }
5369 s++;
5370 }
5371 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372}
5373
5374static
5375int fixtitle(PyUnicodeObject *self)
5376{
5377 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5378 register Py_UNICODE *e;
5379 int previous_is_cased;
5380
5381 /* Shortcut for single character strings */
5382 if (PyUnicode_GET_SIZE(self) == 1) {
5383 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5384 if (*p != ch) {
5385 *p = ch;
5386 return 1;
5387 }
5388 else
5389 return 0;
5390 }
Tim Petersced69f82003-09-16 20:30:58 +00005391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 e = p + PyUnicode_GET_SIZE(self);
5393 previous_is_cased = 0;
5394 for (; p < e; p++) {
5395 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 if (previous_is_cased)
5398 *p = Py_UNICODE_TOLOWER(ch);
5399 else
5400 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005401
5402 if (Py_UNICODE_ISLOWER(ch) ||
5403 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 Py_UNICODE_ISTITLE(ch))
5405 previous_is_cased = 1;
5406 else
5407 previous_is_cased = 0;
5408 }
5409 return 1;
5410}
5411
Tim Peters8ce9f162004-08-27 01:49:32 +00005412PyObject *
5413PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
Tim Peters8ce9f162004-08-27 01:49:32 +00005415 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005416 const Py_UNICODE blank = ' ';
5417 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005418 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005419 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005420 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5421 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005422 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5423 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005424 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005425 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005426 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 fseq = PySequence_Fast(seq, "");
5429 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005430 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005431 }
5432
Tim Peters91879ab2004-08-27 22:35:44 +00005433 /* Grrrr. A codec may be invoked to convert str objects to
5434 * Unicode, and so it's possible to call back into Python code
5435 * during PyUnicode_FromObject(), and so it's possible for a sick
5436 * codec to change the size of fseq (if seq is a list). Therefore
5437 * we have to keep refetching the size -- can't assume seqlen
5438 * is invariant.
5439 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005440 seqlen = PySequence_Fast_GET_SIZE(fseq);
5441 /* If empty sequence, return u"". */
5442 if (seqlen == 0) {
5443 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5444 goto Done;
5445 }
5446 /* If singleton sequence with an exact Unicode, return that. */
5447 if (seqlen == 1) {
5448 item = PySequence_Fast_GET_ITEM(fseq, 0);
5449 if (PyUnicode_CheckExact(item)) {
5450 Py_INCREF(item);
5451 res = (PyUnicodeObject *)item;
5452 goto Done;
5453 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005454 }
5455
Tim Peters05eba1f2004-08-27 21:32:02 +00005456 /* At least two items to join, or one that isn't exact Unicode. */
5457 if (seqlen > 1) {
5458 /* Set up sep and seplen -- they're needed. */
5459 if (separator == NULL) {
5460 sep = &blank;
5461 seplen = 1;
5462 }
5463 else {
5464 internal_separator = PyUnicode_FromObject(separator);
5465 if (internal_separator == NULL)
5466 goto onError;
5467 sep = PyUnicode_AS_UNICODE(internal_separator);
5468 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005469 /* In case PyUnicode_FromObject() mutated seq. */
5470 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005471 }
5472 }
5473
5474 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005475 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005477 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005478 res_p = PyUnicode_AS_UNICODE(res);
5479 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005480
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005482 Py_ssize_t itemlen;
5483 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005484
5485 item = PySequence_Fast_GET_ITEM(fseq, i);
5486 /* Convert item to Unicode. */
5487 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5488 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005489 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005491 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005492 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 item = PyUnicode_FromObject(item);
5495 if (item == NULL)
5496 goto onError;
5497 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005498
Tim Peters91879ab2004-08-27 22:35:44 +00005499 /* In case PyUnicode_FromObject() mutated seq. */
5500 seqlen = PySequence_Fast_GET_SIZE(fseq);
5501
Tim Peters8ce9f162004-08-27 01:49:32 +00005502 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005504 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005505 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005506 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005507 if (i < seqlen - 1) {
5508 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005509 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005510 goto Overflow;
5511 }
5512 if (new_res_used > res_alloc) {
5513 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005514 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005515 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005516 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005517 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005518 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005519 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005520 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005522 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005523 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005525
5526 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005527 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 res_p += itemlen;
5529 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005530 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 res_p += seplen;
5532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005534 res_used = new_res_used;
5535 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005536
Tim Peters05eba1f2004-08-27 21:32:02 +00005537 /* Shrink res to match the used area; this probably can't fail,
5538 * but it's cheap to check.
5539 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005540 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005541 goto onError;
5542
5543 Done:
5544 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005545 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 return (PyObject *)res;
5547
Tim Peters8ce9f162004-08-27 01:49:32 +00005548 Overflow:
5549 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005550 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005551 Py_DECREF(item);
5552 /* fall through */
5553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005556 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005557 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 return NULL;
5559}
5560
Tim Petersced69f82003-09-16 20:30:58 +00005561static
5562PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t left,
5564 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 Py_UNICODE fill)
5566{
5567 PyUnicodeObject *u;
5568
5569 if (left < 0)
5570 left = 0;
5571 if (right < 0)
5572 right = 0;
5573
Tim Peters7a29bd52001-09-12 03:03:31 +00005574 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 Py_INCREF(self);
5576 return self;
5577 }
5578
5579 u = _PyUnicode_New(left + self->length + right);
5580 if (u) {
5581 if (left)
5582 Py_UNICODE_FILL(u->str, fill, left);
5583 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5584 if (right)
5585 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5586 }
5587
5588 return u;
5589}
5590
/* Append data[left:right] to the list being built.

   Relies on names from the enclosing function: a PyObject *str scratch
   variable, the PyObject *list being filled and an onError label that
   is jumped to when either the substring cannot be created or the
   append fails.  The temporary substring reference is always released.
   Expands to plain statements. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5601
5602static
5603PyObject *split_whitespace(PyUnicodeObject *self,
5604 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005607 register Py_ssize_t i;
5608 register Py_ssize_t j;
5609 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005611 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612
5613 for (i = j = 0; i < len; ) {
5614 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005615 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 i++;
5617 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005618 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 i++;
5620 if (j < i) {
5621 if (maxcount-- <= 0)
5622 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005623 SPLIT_APPEND(buf, j, i);
5624 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 i++;
5626 j = i;
5627 }
5628 }
5629 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005630 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
5632 return list;
5633
5634 onError:
5635 Py_DECREF(list);
5636 return NULL;
5637}
5638
5639PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005640 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 PyObject *list;
5646 PyObject *str;
5647 Py_UNICODE *data;
5648
5649 string = PyUnicode_FromObject(string);
5650 if (string == NULL)
5651 return NULL;
5652 data = PyUnicode_AS_UNICODE(string);
5653 len = PyUnicode_GET_SIZE(string);
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 list = PyList_New(0);
5656 if (!list)
5657 goto onError;
5658
5659 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005663 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
5666 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005667 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 if (i < len) {
5669 if (data[i] == '\r' && i + 1 < len &&
5670 data[i+1] == '\n')
5671 i += 2;
5672 else
5673 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005674 if (keepends)
5675 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Guido van Rossum86662912000-04-11 15:38:46 +00005677 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 j = i;
5679 }
5680 if (j < len) {
5681 SPLIT_APPEND(data, j, len);
5682 }
5683
5684 Py_DECREF(string);
5685 return list;
5686
5687 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005688 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 Py_DECREF(string);
5690 return NULL;
5691}
5692
Tim Petersced69f82003-09-16 20:30:58 +00005693static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694PyObject *split_char(PyUnicodeObject *self,
5695 PyObject *list,
5696 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005703 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704
5705 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005706 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 if (maxcount-- <= 0)
5708 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005709 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 i = j = i + 1;
5711 } else
5712 i++;
5713 }
5714 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005715 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
5717 return list;
5718
5719 onError:
5720 Py_DECREF(list);
5721 return NULL;
5722}
5723
Tim Petersced69f82003-09-16 20:30:58 +00005724static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725PyObject *split_substring(PyUnicodeObject *self,
5726 PyObject *list,
5727 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005728 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 register Py_ssize_t i;
5731 register Py_ssize_t j;
5732 Py_ssize_t len = self->length;
5733 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 PyObject *str;
5735
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005736 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 if (Py_UNICODE_MATCH(self, i, substring)) {
5738 if (maxcount-- <= 0)
5739 break;
5740 SPLIT_APPEND(self->str, j, i);
5741 i = j = i + sublen;
5742 } else
5743 i++;
5744 }
5745 if (j <= len) {
5746 SPLIT_APPEND(self->str, j, len);
5747 }
5748 return list;
5749
5750 onError:
5751 Py_DECREF(list);
5752 return NULL;
5753}
5754
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755static
5756PyObject *rsplit_whitespace(PyUnicodeObject *self,
5757 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005759{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 register Py_ssize_t i;
5761 register Py_ssize_t j;
5762 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005763 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005764 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765
5766 for (i = j = len - 1; i >= 0; ) {
5767 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005768 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005769 i--;
5770 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005771 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772 i--;
5773 if (j > i) {
5774 if (maxcount-- <= 0)
5775 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005776 SPLIT_APPEND(buf, i + 1, j + 1);
5777 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 i--;
5779 j = i;
5780 }
5781 }
5782 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005783 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005785 if (PyList_Reverse(list) < 0)
5786 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005787 return list;
5788
5789 onError:
5790 Py_DECREF(list);
5791 return NULL;
5792}
5793
5794static
5795PyObject *rsplit_char(PyUnicodeObject *self,
5796 PyObject *list,
5797 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 register Py_ssize_t i;
5801 register Py_ssize_t j;
5802 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005804 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805
5806 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005807 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005808 if (maxcount-- <= 0)
5809 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005810 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 j = i = i - 1;
5812 } else
5813 i--;
5814 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005815 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005816 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005818 if (PyList_Reverse(list) < 0)
5819 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005820 return list;
5821
5822 onError:
5823 Py_DECREF(list);
5824 return NULL;
5825}
5826
5827static
5828PyObject *rsplit_substring(PyUnicodeObject *self,
5829 PyObject *list,
5830 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 register Py_ssize_t i;
5834 register Py_ssize_t j;
5835 Py_ssize_t len = self->length;
5836 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837 PyObject *str;
5838
5839 for (i = len - sublen, j = len; i >= 0; ) {
5840 if (Py_UNICODE_MATCH(self, i, substring)) {
5841 if (maxcount-- <= 0)
5842 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005843 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 j = i;
5845 i -= sublen;
5846 } else
5847 i--;
5848 }
5849 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005850 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005851 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005852 if (PyList_Reverse(list) < 0)
5853 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005854 return list;
5855
5856 onError:
5857 Py_DECREF(list);
5858 return NULL;
5859}
5860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861#undef SPLIT_APPEND
5862
5863static
5864PyObject *split(PyUnicodeObject *self,
5865 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867{
5868 PyObject *list;
5869
5870 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005871 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
5873 list = PyList_New(0);
5874 if (!list)
5875 return NULL;
5876
5877 if (substring == NULL)
5878 return split_whitespace(self,list,maxcount);
5879
5880 else if (substring->length == 1)
5881 return split_char(self,list,substring->str[0],maxcount);
5882
5883 else if (substring->length == 0) {
5884 Py_DECREF(list);
5885 PyErr_SetString(PyExc_ValueError, "empty separator");
5886 return NULL;
5887 }
5888 else
5889 return split_substring(self,list,substring,maxcount);
5890}
5891
Tim Petersced69f82003-09-16 20:30:58 +00005892static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005893PyObject *rsplit(PyUnicodeObject *self,
5894 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005895 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005896{
5897 PyObject *list;
5898
5899 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005900 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005901
5902 list = PyList_New(0);
5903 if (!list)
5904 return NULL;
5905
5906 if (substring == NULL)
5907 return rsplit_whitespace(self,list,maxcount);
5908
5909 else if (substring->length == 1)
5910 return rsplit_char(self,list,substring->str[0],maxcount);
5911
5912 else if (substring->length == 0) {
5913 Py_DECREF(list);
5914 PyErr_SetString(PyExc_ValueError, "empty separator");
5915 return NULL;
5916 }
5917 else
5918 return rsplit_substring(self,list,substring,maxcount);
5919}
5920
5921static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922PyObject *replace(PyUnicodeObject *self,
5923 PyUnicodeObject *str1,
5924 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
5927 PyUnicodeObject *u;
5928
5929 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005930 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
Fredrik Lundh347ee272006-05-24 16:35:18 +00005932 if (str1->length == str2->length) {
5933 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005934 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005935 if (str1->length == 1) {
5936 /* replace characters */
5937 Py_UNICODE u1, u2;
5938 if (!findchar(self->str, self->length, str1->str[0]))
5939 goto nothing;
5940 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5941 if (!u)
5942 return NULL;
5943 Py_UNICODE_COPY(u->str, self->str, self->length);
5944 u1 = str1->str[0];
5945 u2 = str2->str[0];
5946 for (i = 0; i < u->length; i++)
5947 if (u->str[i] == u1) {
5948 if (--maxcount < 0)
5949 break;
5950 u->str[i] = u2;
5951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005953 i = fastsearch(
5954 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005956 if (i < 0)
5957 goto nothing;
5958 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5959 if (!u)
5960 return NULL;
5961 Py_UNICODE_COPY(u->str, self->str, self->length);
5962 while (i <= self->length - str1->length)
5963 if (Py_UNICODE_MATCH(self, i, str1)) {
5964 if (--maxcount < 0)
5965 break;
5966 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5967 i += str1->length;
5968 } else
5969 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005972
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005973 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005974 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 Py_UNICODE *p;
5976
5977 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005978 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 if (n > maxcount)
5980 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005981 if (n == 0)
5982 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005983 /* new_size = self->length + n * (str2->length - str1->length)); */
5984 delta = (str2->length - str1->length);
5985 if (delta == 0) {
5986 new_size = self->length;
5987 } else {
5988 product = n * (str2->length - str1->length);
5989 if ((product / (str2->length - str1->length)) != n) {
5990 PyErr_SetString(PyExc_OverflowError,
5991 "replace string is too long");
5992 return NULL;
5993 }
5994 new_size = self->length + product;
5995 if (new_size < 0) {
5996 PyErr_SetString(PyExc_OverflowError,
5997 "replace string is too long");
5998 return NULL;
5999 }
6000 }
6001 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006002 if (!u)
6003 return NULL;
6004 i = 0;
6005 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006006 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006007 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006008 while (n-- > 0) {
6009 /* look for next match */
6010 j = i;
6011 while (j <= e) {
6012 if (Py_UNICODE_MATCH(self, j, str1))
6013 break;
6014 j++;
6015 }
6016 if (j > i) {
6017 if (j > e)
6018 break;
6019 /* copy unchanged part [i:j] */
6020 Py_UNICODE_COPY(p, self->str+i, j-i);
6021 p += j - i;
6022 }
6023 /* copy substitution string */
6024 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006025 Py_UNICODE_COPY(p, str2->str, str2->length);
6026 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006027 }
6028 i = j + str1->length;
6029 }
6030 if (i < self->length)
6031 /* copy tail [i:] */
6032 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006033 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006034 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006035 while (n > 0) {
6036 Py_UNICODE_COPY(p, str2->str, str2->length);
6037 p += str2->length;
6038 if (--n <= 0)
6039 break;
6040 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006042 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 }
6044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006046
6047nothing:
6048 /* nothing to replace; return original string (when possible) */
6049 if (PyUnicode_CheckExact(self)) {
6050 Py_INCREF(self);
6051 return (PyObject *) self;
6052 }
6053 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
6056/* --- Unicode Object Methods --------------------------------------------- */
6057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006058PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059"S.title() -> unicode\n\
6060\n\
6061Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006062characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063
6064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006065unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 return fixup(self, fixtitle);
6068}
6069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071"S.capitalize() -> unicode\n\
6072\n\
6073Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006074have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
6076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006077unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return fixup(self, fixcapitalize);
6080}
6081
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, runs fixup(..., fixcapitalize) on every word
   and joins the words again via PyUnicode_Join(NULL, ...).  Returns the
   joined string, or NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous item */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return result;
}
#endif
6119
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006120/* Argument converter. Coerces to a single unicode character */
6121
6122static int
6123convert_uc(PyObject *obj, void *addr)
6124{
6125 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6126 PyObject *uniobj;
6127 Py_UNICODE *unistr;
6128
6129 uniobj = PyUnicode_FromObject(obj);
6130 if (uniobj == NULL) {
6131 PyErr_SetString(PyExc_TypeError,
6132 "The fill character cannot be converted to Unicode");
6133 return 0;
6134 }
6135 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6136 PyErr_SetString(PyExc_TypeError,
6137 "The fill character must be exactly one character long");
6138 Py_DECREF(uniobj);
6139 return 0;
6140 }
6141 unistr = PyUnicode_AS_UNICODE(uniobj);
6142 *fillcharloc = unistr[0];
6143 Py_DECREF(uniobj);
6144 return 1;
6145}
6146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006147PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006148"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006150Return S centered in a Unicode string of length width. Padding is\n\
6151done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject *
6154unicode_center(PyUnicodeObject *self, PyObject *args)
6155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006156 Py_ssize_t marg, left;
6157 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006158 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Thomas Woutersde017742006-02-16 19:34:37 +00006160 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 return NULL;
6162
Tim Peters7a29bd52001-09-12 03:03:31 +00006163 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 Py_INCREF(self);
6165 return (PyObject*) self;
6166 }
6167
6168 marg = width - self->length;
6169 left = marg / 2 + (marg & width & 1);
6170
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006171 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172}
6173
Marc-André Lemburge5034372000-08-08 08:04:29 +00006174#if 0
6175
6176/* This code should go into some future Unicode collation support
6177 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006178 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006179
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180/* speedy UTF-16 code point order comparison */
6181/* gleaned from: */
6182/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6183
/* Adjustment table for the (disabled) UTF-16 ordering comparison; it is
   indexed with a code unit shifted right by 11 bits, so each slot covers
   0x800 consecutive code units. */
static short utf16Fixup[32] =
{
    /* slots  0 - 15 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* slots 16 - 26 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* slot  27 */
    0x2000,
    /* slots 28 - 31 */
    -0x800, -0x800, -0x800, -0x800
};
6191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192static int
6193unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006195 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 Py_UNICODE *s1 = str1->str;
6198 Py_UNICODE *s2 = str2->str;
6199
6200 len1 = str1->length;
6201 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006202
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006204 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006205
6206 c1 = *s1++;
6207 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006208
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006209 if (c1 > (1<<11) * 26)
6210 c1 += utf16Fixup[c1>>11];
6211 if (c2 > (1<<11) * 26)
6212 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006213 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006214
6215 if (c1 != c2)
6216 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006217
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006218 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 }
6220
6221 return (len1 < len2) ? -1 : (len1 != len2);
6222}
6223
Marc-André Lemburge5034372000-08-08 08:04:29 +00006224#else
6225
6226static int
6227unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6228{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006229 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006230
6231 Py_UNICODE *s1 = str1->str;
6232 Py_UNICODE *s2 = str2->str;
6233
6234 len1 = str1->length;
6235 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006236
Marc-André Lemburge5034372000-08-08 08:04:29 +00006237 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006238 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006239
Fredrik Lundh45714e92001-06-26 16:39:36 +00006240 c1 = *s1++;
6241 c2 = *s2++;
6242
6243 if (c1 != c2)
6244 return (c1 < c2) ? -1 : 1;
6245
Marc-André Lemburge5034372000-08-08 08:04:29 +00006246 len1--; len2--;
6247 }
6248
6249 return (len1 < len2) ? -1 : (len1 != len2);
6250}
6251
6252#endif
6253
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254int PyUnicode_Compare(PyObject *left,
6255 PyObject *right)
6256{
6257 PyUnicodeObject *u = NULL, *v = NULL;
6258 int result;
6259
6260 /* Coerce the two arguments */
6261 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6262 if (u == NULL)
6263 goto onError;
6264 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6265 if (v == NULL)
6266 goto onError;
6267
Thomas Wouters7e474022000-07-16 12:04:32 +00006268 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 if (v == u) {
6270 Py_DECREF(u);
6271 Py_DECREF(v);
6272 return 0;
6273 }
6274
6275 result = unicode_compare(u, v);
6276
6277 Py_DECREF(u);
6278 Py_DECREF(v);
6279 return result;
6280
6281onError:
6282 Py_XDECREF(u);
6283 Py_XDECREF(v);
6284 return -1;
6285}
6286
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006287PyObject *PyUnicode_RichCompare(PyObject *left,
6288 PyObject *right,
6289 int op)
6290{
6291 int result;
6292
6293 result = PyUnicode_Compare(left, right);
6294 if (result == -1 && PyErr_Occurred())
6295 goto onError;
6296
6297 /* Convert the return value to a Boolean */
6298 switch (op) {
6299 case Py_EQ:
6300 result = (result == 0);
6301 break;
6302 case Py_NE:
6303 result = (result != 0);
6304 break;
6305 case Py_LE:
6306 result = (result <= 0);
6307 break;
6308 case Py_GE:
6309 result = (result >= 0);
6310 break;
6311 case Py_LT:
6312 result = (result == -1);
6313 break;
6314 case Py_GT:
6315 result = (result == 1);
6316 break;
6317 }
6318 return PyBool_FromLong(result);
6319
6320 onError:
6321
6322 /* Standard case
6323
6324 Type errors mean that PyUnicode_FromObject() could not convert
6325 one of the arguments (usually the right hand side) to Unicode,
6326 ie. we can't handle the comparison request. However, it is
6327 possible that the other object knows a comparison method, which
6328 is why we return Py_NotImplemented to give the other object a
6329 chance.
6330
6331 */
6332 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6333 PyErr_Clear();
6334 Py_INCREF(Py_NotImplemented);
6335 return Py_NotImplemented;
6336 }
6337 if (op != Py_EQ && op != Py_NE)
6338 return NULL;
6339
6340 /* Equality comparison.
6341
6342 This is a special case: we silence any PyExc_UnicodeDecodeError
6343 and instead turn it into a PyErr_UnicodeWarning.
6344
6345 */
6346 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6347 return NULL;
6348 PyErr_Clear();
6349 if (PyErr_Warn(PyExc_UnicodeWarning,
6350 (op == Py_EQ) ?
6351 "Unicode equal comparison "
6352 "failed to convert both arguments to Unicode - "
6353 "interpreting them as being unequal" :
6354 "Unicode unequal comparison "
6355 "failed to convert both arguments to Unicode - "
6356 "interpreting them as being unequal"
6357 ) < 0)
6358 return NULL;
6359 result = (op == Py_NE);
6360 return PyBool_FromLong(result);
6361}
6362
Guido van Rossum403d68b2000-03-13 15:55:09 +00006363int PyUnicode_Contains(PyObject *container,
6364 PyObject *element)
6365{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006366 PyObject *str, *sub;
6367 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006368
6369 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006370 sub = PyUnicode_FromObject(element);
6371 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006372 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006373 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006374 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006375 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006376
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006377 str = PyUnicode_FromObject(container);
6378 if (!str) {
6379 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006380 return -1;
6381 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006382
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006383 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006384
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006385 Py_DECREF(str);
6386 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006387
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006388 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006389}
6390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391/* Concat to string or Unicode object giving a new Unicode object. */
6392
6393PyObject *PyUnicode_Concat(PyObject *left,
6394 PyObject *right)
6395{
6396 PyUnicodeObject *u = NULL, *v = NULL, *w;
6397
6398 /* Coerce the two arguments */
6399 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6400 if (u == NULL)
6401 goto onError;
6402 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6403 if (v == NULL)
6404 goto onError;
6405
6406 /* Shortcuts */
6407 if (v == unicode_empty) {
6408 Py_DECREF(v);
6409 return (PyObject *)u;
6410 }
6411 if (u == unicode_empty) {
6412 Py_DECREF(u);
6413 return (PyObject *)v;
6414 }
6415
6416 /* Concat the two Unicode strings */
6417 w = _PyUnicode_New(u->length + v->length);
6418 if (w == NULL)
6419 goto onError;
6420 Py_UNICODE_COPY(w->str, u->str, u->length);
6421 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6422
6423 Py_DECREF(u);
6424 Py_DECREF(v);
6425 return (PyObject *)w;
6426
6427onError:
6428 Py_XDECREF(u);
6429 Py_XDECREF(v);
6430 return NULL;
6431}
6432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006433PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434"S.count(sub[, start[, end]]) -> int\n\
6435\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006436Return the number of non-overlapping occurrences of substring sub in\n\
6437Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006438interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
6440static PyObject *
6441unicode_count(PyUnicodeObject *self, PyObject *args)
6442{
6443 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006444 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006445 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 PyObject *result;
6447
Guido van Rossumb8872e62000-05-09 14:14:27 +00006448 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6449 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return NULL;
6451
6452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006453 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 if (substring == NULL)
6455 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Fredrik Lundhc8162812006-05-26 19:33:03 +00006457 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006459 result = PyInt_FromSsize_t(
6460 stringlib_count(self->str + start, end - start,
6461 substring->str, substring->length)
6462 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463
6464 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return result;
6467}
6468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472Encodes S using the codec registered for encoding. encoding defaults\n\
6473to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006474handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6476'xmlcharrefreplace' as well as any other name registered with\n\
6477codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
6479static PyObject *
6480unicode_encode(PyUnicodeObject *self, PyObject *args)
6481{
6482 char *encoding = NULL;
6483 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484 PyObject *v;
6485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6487 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006489 if (v == NULL)
6490 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006491 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6492 PyErr_Format(PyExc_TypeError,
6493 "encoder did not return a string/unicode object "
6494 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006495 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006496 Py_DECREF(v);
6497 return NULL;
6498 }
6499 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006500
6501 onError:
6502 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006503}
6504
6505PyDoc_STRVAR(decode__doc__,
6506"S.decode([encoding[,errors]]) -> string or unicode\n\
6507\n\
6508Decodes S using the codec registered for encoding. encoding defaults\n\
6509to the default encoding. errors may be given to set a different error\n\
6510handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6511a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6512as well as any other name registerd with codecs.register_error that is\n\
6513able to handle UnicodeDecodeErrors.");
6514
6515static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006516unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006517{
6518 char *encoding = NULL;
6519 char *errors = NULL;
6520 PyObject *v;
6521
6522 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6523 return NULL;
6524 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006525 if (v == NULL)
6526 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006527 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6528 PyErr_Format(PyExc_TypeError,
6529 "decoder did not return a string/unicode object "
6530 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006531 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006532 Py_DECREF(v);
6533 return NULL;
6534 }
6535 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006536
6537 onError:
6538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006541PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542"S.expandtabs([tabsize]) -> unicode\n\
6543\n\
6544Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547static PyObject*
6548unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6549{
6550 Py_UNICODE *e;
6551 Py_UNICODE *p;
6552 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006553 Py_UNICODE *qe;
6554 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 PyUnicodeObject *u;
6556 int tabsize = 8;
6557
6558 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6559 return NULL;
6560
Thomas Wouters7e474022000-07-16 12:04:32 +00006561 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006562 i = 0; /* chars up to and including most recent \n or \r */
6563 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6564 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 for (p = self->str; p < e; p++)
6566 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006567 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006568 incr = tabsize - (j % tabsize); /* cannot overflow */
6569 if (j > PY_SSIZE_T_MAX - incr)
6570 goto overflow1;
6571 j += incr;
6572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 }
6574 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006575 if (j > PY_SSIZE_T_MAX - 1)
6576 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 j++;
6578 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006579 if (i > PY_SSIZE_T_MAX - j)
6580 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006582 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 }
6584 }
6585
Guido van Rossum5bdff602008-03-11 21:18:06 +00006586 if (i > PY_SSIZE_T_MAX - j)
6587 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 /* Second pass: create output string and fill it */
6590 u = _PyUnicode_New(i + j);
6591 if (!u)
6592 return NULL;
6593
Guido van Rossum5bdff602008-03-11 21:18:06 +00006594 j = 0; /* same as in first pass */
6595 q = u->str; /* next output char */
6596 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598 for (p = self->str; p < e; p++)
6599 if (*p == '\t') {
6600 if (tabsize > 0) {
6601 i = tabsize - (j % tabsize);
6602 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006603 while (i--) {
6604 if (q >= qe)
6605 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 }
6609 }
6610 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006611 if (q >= qe)
6612 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006614 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 if (*p == '\n' || *p == '\r')
6616 j = 0;
6617 }
6618
6619 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006620
6621 overflow2:
6622 Py_DECREF(u);
6623 overflow1:
6624 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006628PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629"S.find(sub [,start [,end]]) -> int\n\
6630\n\
6631Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006632such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633arguments start and end are interpreted as in slice notation.\n\
6634\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006635Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
6637static PyObject *
6638unicode_find(PyUnicodeObject *self, PyObject *args)
6639{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006640 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006641 Py_ssize_t start;
6642 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006643 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
Facundo Batista57d56692007-11-16 18:04:14 +00006645 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006648 result = stringlib_find_slice(
6649 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6650 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6651 start, end
6652 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006655
6656 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657}
6658
6659static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006660unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
6662 if (index < 0 || index >= self->length) {
6663 PyErr_SetString(PyExc_IndexError, "string index out of range");
6664 return NULL;
6665 }
6666
6667 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6668}
6669
6670static long
6671unicode_hash(PyUnicodeObject *self)
6672{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006673 /* Since Unicode objects compare equal to their ASCII string
6674 counterparts, they should use the individual character values
6675 as basis for their hash value. This is needed to assure that
6676 strings and Unicode objects behave in the same way as
6677 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006680 register Py_UNICODE *p;
6681 register long x;
6682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 if (self->hash != -1)
6684 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006685 len = PyUnicode_GET_SIZE(self);
6686 p = PyUnicode_AS_UNICODE(self);
6687 x = *p << 7;
6688 while (--len >= 0)
6689 x = (1000003*x) ^ *p++;
6690 x ^= PyUnicode_GET_SIZE(self);
6691 if (x == -1)
6692 x = -2;
6693 self->hash = x;
6694 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695}
6696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006697PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698"S.index(sub [,start [,end]]) -> int\n\
6699\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
6702static PyObject *
6703unicode_index(PyUnicodeObject *self, PyObject *args)
6704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006705 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006706 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006707 Py_ssize_t start;
6708 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
Facundo Batista57d56692007-11-16 18:04:14 +00006710 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006713 result = stringlib_find_slice(
6714 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6715 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6716 start, end
6717 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 if (result < 0) {
6722 PyErr_SetString(PyExc_ValueError, "substring not found");
6723 return NULL;
6724 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006725
Martin v. Löwis18e16552006-02-15 17:27:45 +00006726 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727}
6728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006729PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006730"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006732Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006733at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734
6735static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006736unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
6738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6739 register const Py_UNICODE *e;
6740 int cased;
6741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 /* Shortcut for single character strings */
6743 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006746 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006747 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006748 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006749
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 e = p + PyUnicode_GET_SIZE(self);
6751 cased = 0;
6752 for (; p < e; p++) {
6753 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006754
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 else if (!cased && Py_UNICODE_ISLOWER(ch))
6758 cased = 1;
6759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006760 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761}
6762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006763PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006764"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006766Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006767at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768
6769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006770unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771{
6772 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6773 register const Py_UNICODE *e;
6774 int cased;
6775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 /* Shortcut for single character strings */
6777 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006780 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006781 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006783
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 e = p + PyUnicode_GET_SIZE(self);
6785 cased = 0;
6786 for (; p < e; p++) {
6787 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006788
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 else if (!cased && Py_UNICODE_ISUPPER(ch))
6792 cased = 1;
6793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006797PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006798"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006800Return True if S is a titlecased string and there is at least one\n\
6801character in S, i.e. upper- and titlecase characters may only\n\
6802follow uncased characters and lowercase characters only cased ones.\n\
6803Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810 int cased, previous_is_cased;
6811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 /* Shortcut for single character strings */
6813 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6815 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006817 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006818 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 e = p + PyUnicode_GET_SIZE(self);
6822 cased = 0;
6823 previous_is_cased = 0;
6824 for (; p < e; p++) {
6825 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6828 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 previous_is_cased = 1;
6831 cased = 1;
6832 }
6833 else if (Py_UNICODE_ISLOWER(ch)) {
6834 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 previous_is_cased = 1;
6837 cased = 1;
6838 }
6839 else
6840 previous_is_cased = 0;
6841 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006842 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843}
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006846"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006848Return True if all characters in S are whitespace\n\
6849and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006852unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
6854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6855 register const Py_UNICODE *e;
6856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 /* Shortcut for single character strings */
6858 if (PyUnicode_GET_SIZE(self) == 1 &&
6859 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006862 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006863 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 e = p + PyUnicode_GET_SIZE(self);
6867 for (; p < e; p++) {
6868 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006877Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006878and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879
6880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006881unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006882{
6883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6884 register const Py_UNICODE *e;
6885
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006886 /* Shortcut for single character strings */
6887 if (PyUnicode_GET_SIZE(self) == 1 &&
6888 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006890
6891 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006892 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006894
6895 e = p + PyUnicode_GET_SIZE(self);
6896 for (; p < e; p++) {
6897 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006901}
6902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006905\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006906Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908
6909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006910unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911{
6912 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6913 register const Py_UNICODE *e;
6914
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915 /* Shortcut for single character strings */
6916 if (PyUnicode_GET_SIZE(self) == 1 &&
6917 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006919
6920 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006921 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006923
6924 e = p + PyUnicode_GET_SIZE(self);
6925 for (; p < e; p++) {
6926 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006928 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006929 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006930}
6931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006932PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006936False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
6938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006939unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
6941 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6942 register const Py_UNICODE *e;
6943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 /* Shortcut for single character strings */
6945 if (PyUnicode_GET_SIZE(self) == 1 &&
6946 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006947 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006949 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006950 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006952
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 e = p + PyUnicode_GET_SIZE(self);
6954 for (; p < e; p++) {
6955 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959}
6960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006961PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006964Return True if all characters in S are digits\n\
6965and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
6967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006968unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969{
6970 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6971 register const Py_UNICODE *e;
6972
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 /* Shortcut for single character strings */
6974 if (PyUnicode_GET_SIZE(self) == 1 &&
6975 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006976 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006978 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006979 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 e = p + PyUnicode_GET_SIZE(self);
6983 for (; p < e; p++) {
6984 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988}
6989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006990PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006994False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995
6996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006997unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998{
6999 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7000 register const Py_UNICODE *e;
7001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 /* Shortcut for single character strings */
7003 if (PyUnicode_GET_SIZE(self) == 1 &&
7004 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007005 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007007 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007008 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 e = p + PyUnicode_GET_SIZE(self);
7012 for (; p < e; p++) {
7013 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007016 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020"S.join(sequence) -> unicode\n\
7021\n\
7022Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024
7025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007028 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029}
7030
Martin v. Löwis18e16552006-02-15 17:27:45 +00007031static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032unicode_length(PyUnicodeObject *self)
7033{
7034 return self->length;
7035}
7036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007037PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007038"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039\n\
7040Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007041done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042
7043static PyObject *
7044unicode_ljust(PyUnicodeObject *self, PyObject *args)
7045{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007046 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007047 Py_UNICODE fillchar = ' ';
7048
Martin v. Löwis412fb672006-04-13 06:34:32 +00007049 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 return NULL;
7051
Tim Peters7a29bd52001-09-12 03:03:31 +00007052 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 Py_INCREF(self);
7054 return (PyObject*) self;
7055 }
7056
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007057 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058}
7059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007060PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061"S.lower() -> unicode\n\
7062\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064
7065static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007066unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 return fixup(self, fixlower);
7069}
7070
/* Strip modes shared by the strip helpers; the values double as indices
   into stripformat[] below, so keep both in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7079
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080/* externally visible for str.strip(unicode) */
7081PyObject *
7082_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7083{
7084 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007085 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007087 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7088 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007090 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7091
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092 i = 0;
7093 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007094 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7095 i++;
7096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097 }
7098
7099 j = len;
7100 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007101 do {
7102 j--;
7103 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7104 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007105 }
7106
7107 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007108 Py_INCREF(self);
7109 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110 }
7111 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007112 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113}
7114
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115
7116static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007120 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121
7122 i = 0;
7123 if (striptype != RIGHTSTRIP) {
7124 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7125 i++;
7126 }
7127 }
7128
7129 j = len;
7130 if (striptype != LEFTSTRIP) {
7131 do {
7132 j--;
7133 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7134 j++;
7135 }
7136
7137 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7138 Py_INCREF(self);
7139 return (PyObject*)self;
7140 }
7141 else
7142 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143}
7144
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145
7146static PyObject *
7147do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7148{
7149 PyObject *sep = NULL;
7150
7151 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7152 return NULL;
7153
7154 if (sep != NULL && sep != Py_None) {
7155 if (PyUnicode_Check(sep))
7156 return _PyUnicode_XStrip(self, striptype, sep);
7157 else if (PyString_Check(sep)) {
7158 PyObject *res;
7159 sep = PyUnicode_FromObject(sep);
7160 if (sep==NULL)
7161 return NULL;
7162 res = _PyUnicode_XStrip(self, striptype, sep);
7163 Py_DECREF(sep);
7164 return res;
7165 }
7166 else {
7167 PyErr_Format(PyExc_TypeError,
7168 "%s arg must be None, unicode or str",
7169 STRIPNAME(striptype));
7170 return NULL;
7171 }
7172 }
7173
7174 return do_strip(self, striptype);
7175}
7176
7177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007179"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007180\n\
7181Return a copy of the string S with leading and trailing\n\
7182whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007183If chars is given and not None, remove characters in chars instead.\n\
7184If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185
7186static PyObject *
7187unicode_strip(PyUnicodeObject *self, PyObject *args)
7188{
7189 if (PyTuple_GET_SIZE(args) == 0)
7190 return do_strip(self, BOTHSTRIP); /* Common case */
7191 else
7192 return do_argstrip(self, BOTHSTRIP, args);
7193}
7194
7195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007196PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007198\n\
7199Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007200If chars is given and not None, remove characters in chars instead.\n\
7201If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007202
7203static PyObject *
7204unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7205{
7206 if (PyTuple_GET_SIZE(args) == 0)
7207 return do_strip(self, LEFTSTRIP); /* Common case */
7208 else
7209 return do_argstrip(self, LEFTSTRIP, args);
7210}
7211
7212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007215\n\
7216Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007217If chars is given and not None, remove characters in chars instead.\n\
7218If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007219
7220static PyObject *
7221unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7222{
7223 if (PyTuple_GET_SIZE(args) == 0)
7224 return do_strip(self, RIGHTSTRIP); /* Common case */
7225 else
7226 return do_argstrip(self, RIGHTSTRIP, args);
7227}
7228
7229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232{
7233 PyUnicodeObject *u;
7234 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007235 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007236 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237
7238 if (len < 0)
7239 len = 0;
7240
Tim Peters7a29bd52001-09-12 03:03:31 +00007241 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 /* no repeat, return original string */
7243 Py_INCREF(str);
7244 return (PyObject*) str;
7245 }
Tim Peters8f422462000-09-09 06:13:41 +00007246
7247 /* ensure # of chars needed doesn't overflow int and # of bytes
7248 * needed doesn't overflow size_t
7249 */
7250 nchars = len * str->length;
7251 if (len && nchars / len != str->length) {
7252 PyErr_SetString(PyExc_OverflowError,
7253 "repeated string is too long");
7254 return NULL;
7255 }
7256 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7257 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7258 PyErr_SetString(PyExc_OverflowError,
7259 "repeated string is too long");
7260 return NULL;
7261 }
7262 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 if (!u)
7264 return NULL;
7265
7266 p = u->str;
7267
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007268 if (str->length == 1 && len > 0) {
7269 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007270 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007271 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007272 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007273 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007274 done = str->length;
7275 }
7276 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007277 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007278 Py_UNICODE_COPY(p+done, p, n);
7279 done += n;
7280 }
7281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
7283 return (PyObject*) u;
7284}
7285
7286PyObject *PyUnicode_Replace(PyObject *obj,
7287 PyObject *subobj,
7288 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290{
7291 PyObject *self;
7292 PyObject *str1;
7293 PyObject *str2;
7294 PyObject *result;
7295
7296 self = PyUnicode_FromObject(obj);
7297 if (self == NULL)
7298 return NULL;
7299 str1 = PyUnicode_FromObject(subobj);
7300 if (str1 == NULL) {
7301 Py_DECREF(self);
7302 return NULL;
7303 }
7304 str2 = PyUnicode_FromObject(replobj);
7305 if (str2 == NULL) {
7306 Py_DECREF(self);
7307 Py_DECREF(str1);
7308 return NULL;
7309 }
Tim Petersced69f82003-09-16 20:30:58 +00007310 result = replace((PyUnicodeObject *)self,
7311 (PyUnicodeObject *)str1,
7312 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 maxcount);
7314 Py_DECREF(self);
7315 Py_DECREF(str1);
7316 Py_DECREF(str2);
7317 return result;
7318}
7319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321"S.replace (old, new[, maxsplit]) -> unicode\n\
7322\n\
7323Return a copy of S with all occurrences of substring\n\
7324old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
7327static PyObject*
7328unicode_replace(PyUnicodeObject *self, PyObject *args)
7329{
7330 PyUnicodeObject *str1;
7331 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 PyObject *result;
7334
Martin v. Löwis18e16552006-02-15 17:27:45 +00007335 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 return NULL;
7337 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7338 if (str1 == NULL)
7339 return NULL;
7340 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007341 if (str2 == NULL) {
7342 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345
7346 result = replace(self, str1, str2, maxcount);
7347
7348 Py_DECREF(str1);
7349 Py_DECREF(str2);
7350 return result;
7351}
7352
7353static
7354PyObject *unicode_repr(PyObject *unicode)
7355{
7356 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7357 PyUnicode_GET_SIZE(unicode),
7358 1);
7359}
7360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362"S.rfind(sub [,start [,end]]) -> int\n\
7363\n\
7364Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007365such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366arguments start and end are interpreted as in slice notation.\n\
7367\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
7370static PyObject *
7371unicode_rfind(PyUnicodeObject *self, PyObject *args)
7372{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007373 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007374 Py_ssize_t start;
7375 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007376 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
Facundo Batista57d56692007-11-16 18:04:14 +00007378 if (!_ParseTupleFinds(args, &substring, &start, &end))
7379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007381 result = stringlib_rfind_slice(
7382 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7383 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7384 start, end
7385 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386
7387 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007388
7389 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393"S.rindex(sub [,start [,end]]) -> int\n\
7394\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
7397static PyObject *
7398unicode_rindex(PyUnicodeObject *self, PyObject *args)
7399{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007400 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007401 Py_ssize_t start;
7402 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007403 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
Facundo Batista57d56692007-11-16 18:04:14 +00007405 if (!_ParseTupleFinds(args, &substring, &start, &end))
7406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007408 result = stringlib_rfind_slice(
7409 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7410 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7411 start, end
7412 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 if (result < 0) {
7417 PyErr_SetString(PyExc_ValueError, "substring not found");
7418 return NULL;
7419 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007420 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421}
7422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007423PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007424"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425\n\
7426Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007427done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
7429static PyObject *
7430unicode_rjust(PyUnicodeObject *self, PyObject *args)
7431{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007432 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007433 Py_UNICODE fillchar = ' ';
7434
Martin v. Löwis412fb672006-04-13 06:34:32 +00007435 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 return NULL;
7437
Tim Peters7a29bd52001-09-12 03:03:31 +00007438 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 Py_INCREF(self);
7440 return (PyObject*) self;
7441 }
7442
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007443 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444}
7445
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007447unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448{
7449 /* standard clamping */
7450 if (start < 0)
7451 start = 0;
7452 if (end < 0)
7453 end = 0;
7454 if (end > self->length)
7455 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007456 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 /* full slice, return original string */
7458 Py_INCREF(self);
7459 return (PyObject*) self;
7460 }
7461 if (start > end)
7462 start = end;
7463 /* copy slice */
7464 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7465 end - start);
7466}
7467
7468PyObject *PyUnicode_Split(PyObject *s,
7469 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007470 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
7472 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007473
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 s = PyUnicode_FromObject(s);
7475 if (s == NULL)
7476 return NULL;
7477 if (sep != NULL) {
7478 sep = PyUnicode_FromObject(sep);
7479 if (sep == NULL) {
7480 Py_DECREF(s);
7481 return NULL;
7482 }
7483 }
7484
7485 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7486
7487 Py_DECREF(s);
7488 Py_XDECREF(sep);
7489 return result;
7490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493"S.split([sep [,maxsplit]]) -> list of strings\n\
7494\n\
7495Return a list of the words in S, using sep as the\n\
7496delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007497splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007498whitespace string is a separator and empty strings are\n\
7499removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
7501static PyObject*
7502unicode_split(PyUnicodeObject *self, PyObject *args)
7503{
7504 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
7509
7510 if (substring == Py_None)
7511 return split(self, NULL, maxcount);
7512 else if (PyUnicode_Check(substring))
7513 return split(self, (PyUnicodeObject *)substring, maxcount);
7514 else
7515 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7516}
7517
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518PyObject *
7519PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7520{
7521 PyObject* str_obj;
7522 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007523 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007524
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007525 str_obj = PyUnicode_FromObject(str_in);
7526 if (!str_obj)
7527 return NULL;
7528 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007529 if (!sep_obj) {
7530 Py_DECREF(str_obj);
7531 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007532 }
7533
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007534 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007535 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7536 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7537 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007538
Fredrik Lundhb9479482006-05-26 17:22:38 +00007539 Py_DECREF(sep_obj);
7540 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007541
7542 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007543}
7544
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007545
7546PyObject *
7547PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7548{
7549 PyObject* str_obj;
7550 PyObject* sep_obj;
7551 PyObject* out;
7552
7553 str_obj = PyUnicode_FromObject(str_in);
7554 if (!str_obj)
7555 return NULL;
7556 sep_obj = PyUnicode_FromObject(sep_in);
7557 if (!sep_obj) {
7558 Py_DECREF(str_obj);
7559 return NULL;
7560 }
7561
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007562 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007563 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7564 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7565 );
7566
7567 Py_DECREF(sep_obj);
7568 Py_DECREF(str_obj);
7569
7570 return out;
7571}
7572
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007573PyDoc_STRVAR(partition__doc__,
7574"S.partition(sep) -> (head, sep, tail)\n\
7575\n\
7576Searches for the separator sep in S, and returns the part before it,\n\
7577the separator itself, and the part after it. If the separator is not\n\
7578found, returns S and two empty strings.");
7579
7580static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007581unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007582{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007583 return PyUnicode_Partition((PyObject *)self, separator);
7584}
7585
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007586PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007587"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007588\n\
7589Searches for the separator sep in S, starting at the end of S, and returns\n\
7590the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007591separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007592
7593static PyObject*
7594unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7595{
7596 return PyUnicode_RPartition((PyObject *)self, separator);
7597}
7598
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007599PyObject *PyUnicode_RSplit(PyObject *s,
7600 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007601 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007602{
7603 PyObject *result;
7604
7605 s = PyUnicode_FromObject(s);
7606 if (s == NULL)
7607 return NULL;
7608 if (sep != NULL) {
7609 sep = PyUnicode_FromObject(sep);
7610 if (sep == NULL) {
7611 Py_DECREF(s);
7612 return NULL;
7613 }
7614 }
7615
7616 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7617
7618 Py_DECREF(s);
7619 Py_XDECREF(sep);
7620 return result;
7621}
7622
7623PyDoc_STRVAR(rsplit__doc__,
7624"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7625\n\
7626Return a list of the words in S, using sep as the\n\
7627delimiter string, starting at the end of the string and\n\
7628working to the front. If maxsplit is given, at most maxsplit\n\
7629splits are done. If sep is not specified, any whitespace string\n\
7630is a separator.");
7631
7632static PyObject*
7633unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7634{
7635 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007639 return NULL;
7640
7641 if (substring == Py_None)
7642 return rsplit(self, NULL, maxcount);
7643 else if (PyUnicode_Check(substring))
7644 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7645 else
7646 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007650"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651\n\
7652Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007653Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject*
7657unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7658{
Guido van Rossum86662912000-04-11 15:38:46 +00007659 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Guido van Rossum86662912000-04-11 15:38:46 +00007661 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 return NULL;
7663
Guido van Rossum86662912000-04-11 15:38:46 +00007664 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665}
7666
7667static
7668PyObject *unicode_str(PyUnicodeObject *self)
7669{
Fred Drakee4315f52000-05-09 19:53:39 +00007670 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674"S.swapcase() -> unicode\n\
7675\n\
7676Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
7679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007680unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 return fixup(self, fixswapcase);
7683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686"S.translate(table) -> unicode\n\
7687\n\
7688Return a copy of the string S, where all characters have been mapped\n\
7689through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007690Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7691Unmapped characters are left untouched. Characters mapped to None\n\
7692are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007695unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Tim Petersced69f82003-09-16 20:30:58 +00007697 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007699 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 "ignore");
7701}
7702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007703PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704"S.upper() -> unicode\n\
7705\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
7708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007709unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 return fixup(self, fixupper);
7712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715"S.zfill(width) -> unicode\n\
7716\n\
7717Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject *
7721unicode_zfill(PyUnicodeObject *self, PyObject *args)
7722{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 PyUnicodeObject *u;
7725
Martin v. Löwis18e16552006-02-15 17:27:45 +00007726 Py_ssize_t width;
7727 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 return NULL;
7729
7730 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007731 if (PyUnicode_CheckExact(self)) {
7732 Py_INCREF(self);
7733 return (PyObject*) self;
7734 }
7735 else
7736 return PyUnicode_FromUnicode(
7737 PyUnicode_AS_UNICODE(self),
7738 PyUnicode_GET_SIZE(self)
7739 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
7741
7742 fill = width - self->length;
7743
7744 u = pad(self, fill, 0, '0');
7745
Walter Dörwald068325e2002-04-15 13:36:47 +00007746 if (u == NULL)
7747 return NULL;
7748
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 if (u->str[fill] == '+' || u->str[fill] == '-') {
7750 /* move sign to beginning of string */
7751 u->str[0] = u->str[fill];
7752 u->str[fill] = '0';
7753 }
7754
7755 return (PyObject*) u;
7756}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
#if 0
/* Compiled out by the surrounding #if 0: returns the value of numfree
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007767"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007769Return True if S starts with the specified prefix, False otherwise.\n\
7770With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007771With optional end, stop comparing S at that position.\n\
7772prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
7774static PyObject *
7775unicode_startswith(PyUnicodeObject *self,
7776 PyObject *args)
7777{
Georg Brandl24250812006-06-09 18:45:48 +00007778 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007780 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007781 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007782 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783
Georg Brandl24250812006-06-09 18:45:48 +00007784 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007785 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007787 if (PyTuple_Check(subobj)) {
7788 Py_ssize_t i;
7789 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7790 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7791 PyTuple_GET_ITEM(subobj, i));
7792 if (substring == NULL)
7793 return NULL;
7794 result = tailmatch(self, substring, start, end, -1);
7795 Py_DECREF(substring);
7796 if (result) {
7797 Py_RETURN_TRUE;
7798 }
7799 }
7800 /* nothing matched */
7801 Py_RETURN_FALSE;
7802 }
7803 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007805 return NULL;
7806 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007808 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809}
7810
7811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007812PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007813"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007815Return True if S ends with the specified suffix, False otherwise.\n\
7816With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007817With optional end, stop comparing S at that position.\n\
7818suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
7820static PyObject *
7821unicode_endswith(PyUnicodeObject *self,
7822 PyObject *args)
7823{
Georg Brandl24250812006-06-09 18:45:48 +00007824 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007826 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007827 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007828 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829
Georg Brandl24250812006-06-09 18:45:48 +00007830 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7831 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007833 if (PyTuple_Check(subobj)) {
7834 Py_ssize_t i;
7835 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7836 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7837 PyTuple_GET_ITEM(subobj, i));
7838 if (substring == NULL)
7839 return NULL;
7840 result = tailmatch(self, substring, start, end, +1);
7841 Py_DECREF(substring);
7842 if (result) {
7843 Py_RETURN_TRUE;
7844 }
7845 }
7846 Py_RETURN_FALSE;
7847 }
7848 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851
Georg Brandl24250812006-06-09 18:45:48 +00007852 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007854 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855}
7856
7857
Eric Smitha9f7d622008-02-17 19:46:49 +00007858/* Implements do_string_format, which is unicode because of stringlib */
7859#include "stringlib/string_format.h"
7860
/* Docstring attached to the "format" entry of unicode_methods.  Only the
   signature line is provided; the description that would follow the blank
   line is empty. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> unicode\n\
\n\
");
7865
/* Docstring attached to the "__format__" entry of unicode_methods.  Only
   the signature line is provided; the description that would follow the
   blank line is empty. */
PyDoc_STRVAR(p_format__doc__,
"S.__format__(format_spec) -> unicode\n\
\n\
");
7870
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007871
7872static PyObject *
7873unicode_getnewargs(PyUnicodeObject *v)
7874{
7875 return Py_BuildValue("(u#)", v->str, v->length);
7876}
7877
7878
/* Method table for the unicode type.

   Each entry is {python-visible name, C implementation, calling-convention
   flags, docstring}.  Entries that stop after the flags leave the docstring
   pointer zero-initialized.  The table is terminated by the {NULL, NULL}
   sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* str.format() machinery; the implementations come from the stringlib
       header included above this table. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7940
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007941static PyObject *
7942unicode_mod(PyObject *v, PyObject *w)
7943{
7944 if (!PyUnicode_Check(v)) {
7945 Py_INCREF(Py_NotImplemented);
7946 return Py_NotImplemented;
7947 }
7948 return PyUnicode_Format(v, w);
7949}
7950
/* Number protocol table for unicode.  Only nb_remainder is populated (with
   unicode_mod); the slots listed before it are explicitly 0 and every slot
   after the last initializer is zero-initialized by the compiler. */
static PyNumberMethods unicode_as_number = {
    0, /*nb_add*/
    0, /*nb_subtract*/
    0, /*nb_multiply*/
    0, /*nb_divide*/
    unicode_mod, /*nb_remainder*/
};
7958
/* Sequence protocol table for unicode.  The two assignment slots are 0, so
   item and slice assignment are not provided through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, /* sq_length */
    PyUnicode_Concat, /* sq_concat */
    (ssizeargfunc) unicode_repeat, /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    (ssizessizeargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    PyUnicode_Contains, /* sq_contains */
};
7969
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007970static PyObject*
7971unicode_subscript(PyUnicodeObject* self, PyObject* item)
7972{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007973 if (PyIndex_Check(item)) {
7974 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007975 if (i == -1 && PyErr_Occurred())
7976 return NULL;
7977 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007978 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007979 return unicode_getitem(self, i);
7980 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007982 Py_UNICODE* source_buf;
7983 Py_UNICODE* result_buf;
7984 PyObject* result;
7985
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007986 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007987 &start, &stop, &step, &slicelength) < 0) {
7988 return NULL;
7989 }
7990
7991 if (slicelength <= 0) {
7992 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007993 } else if (start == 0 && step == 1 && slicelength == self->length &&
7994 PyUnicode_CheckExact(self)) {
7995 Py_INCREF(self);
7996 return (PyObject *)self;
7997 } else if (step == 1) {
7998 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 } else {
8000 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008001 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8002 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008003
8004 if (result_buf == NULL)
8005 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008006
8007 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8008 result_buf[i] = source_buf[cur];
8009 }
Tim Petersced69f82003-09-16 20:30:58 +00008010
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008011 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008012 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013 return result;
8014 }
8015 } else {
8016 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8017 return NULL;
8018 }
8019}
8020
/* Mapping protocol table for unicode: length and subscripting (integer
   index or slice, see unicode_subscript) are provided; the assignment slot
   is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length, /* mp_length */
    (binaryfunc)unicode_subscript, /* mp_subscript */
    (objobjargproc)0, /* mp_ass_subscript */
};
8026
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 const void **ptr)
8031{
8032 if (index != 0) {
8033 PyErr_SetString(PyExc_SystemError,
8034 "accessing non-existent unicode segment");
8035 return -1;
8036 }
8037 *ptr = (void *) self->str;
8038 return PyUnicode_GET_DATA_SIZE(self);
8039}
8040
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041static Py_ssize_t
8042unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 const void **ptr)
8044{
8045 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008046 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 return -1;
8048}
8049
8050static int
8051unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008052 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
8054 if (lenp)
8055 *lenp = PyUnicode_GET_DATA_SIZE(self);
8056 return 1;
8057}
8058
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008059static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008061 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 const void **ptr)
8063{
8064 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008065
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 if (index != 0) {
8067 PyErr_SetString(PyExc_SystemError,
8068 "accessing non-existent unicode segment");
8069 return -1;
8070 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008071 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 if (str == NULL)
8073 return -1;
8074 *ptr = (void *) PyString_AS_STRING(str);
8075 return PyString_GET_SIZE(str);
8076}
8077
8078/* Helpers for PyUnicode_Format() */
8079
8080static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008083 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 if (argidx < arglen) {
8085 (*p_argidx)++;
8086 if (arglen < 0)
8087 return args;
8088 else
8089 return PyTuple_GetItem(args, argidx);
8090 }
8091 PyErr_SetString(PyExc_TypeError,
8092 "not enough arguments for format string");
8093 return NULL;
8094}
8095
/* Conversion flags accumulated by PyUnicode_Format() while it scans the
   characters following '%'.  Each flag is its own bit so several can be
   OR-ed into a single int. */
#define F_LJUST (1<<0) /* '-' flag; also set when a '*' width is negative */
#define F_SIGN (1<<1) /* '+' flag */
#define F_BLANK (1<<2) /* ' ' flag */
#define F_ALT (1<<3) /* '#' flag */
#define F_ZERO (1<<4) /* '0' flag */
8101
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008103strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 register Py_ssize_t i;
8106 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 for (i = len - 1; i >= 0; i--)
8108 buffer[i] = (Py_UNICODE) charbuffer[i];
8109
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return len;
8111}
8112
Neal Norwitzfc76d632006-01-10 06:03:13 +00008113static int
8114doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8115{
Tim Peters15231542006-02-16 01:08:01 +00008116 Py_ssize_t result;
8117
Neal Norwitzfc76d632006-01-10 06:03:13 +00008118 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008119 result = strtounicode(buffer, (char *)buffer);
8120 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008121}
8122
8123static int
8124longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8125{
Tim Peters15231542006-02-16 01:08:01 +00008126 Py_ssize_t result;
8127
Neal Norwitzfc76d632006-01-10 06:03:13 +00008128 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008129 result = strtounicode(buffer, (char *)buffer);
8130 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008131}
8132
Guido van Rossum078151d2002-08-11 04:24:12 +00008133/* XXX To save some code duplication, formatfloat/long/int could have been
8134 shared with stringobject.c, converting from 8-bit to Unicode after the
8135 formatting is done. */
8136
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137static int
8138formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008139 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 int flags,
8141 int prec,
8142 int type,
8143 PyObject *v)
8144{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008145 /* fmt = '%#.' + `prec` + `type`
8146 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 char fmt[20];
8148 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 x = PyFloat_AsDouble(v);
8151 if (x == -1.0 && PyErr_Occurred())
8152 return -1;
8153 if (prec < 0)
8154 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8156 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008157 /* Worst case length calc to ensure no buffer overrun:
8158
8159 'g' formats:
8160 fmt = %#.<prec>g
8161 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8162 for any double rep.)
8163 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8164
8165 'f' formats:
8166 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8167 len = 1 + 50 + 1 + prec = 52 + prec
8168
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008169 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008170 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008171
8172 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008173 if (((type == 'g' || type == 'G') &&
8174 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008175 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008176 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008177 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008178 return -1;
8179 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008180 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8181 (flags&F_ALT) ? "#" : "",
8182 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008183 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184}
8185
Tim Peters38fd5b62000-09-21 05:43:11 +00008186static PyObject*
8187formatlong(PyObject *val, int flags, int prec, int type)
8188{
8189 char *buf;
8190 int i, len;
8191 PyObject *str; /* temporary string object. */
8192 PyUnicodeObject *result;
8193
8194 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8195 if (!str)
8196 return NULL;
8197 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008198 if (!result) {
8199 Py_DECREF(str);
8200 return NULL;
8201 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008202 for (i = 0; i < len; i++)
8203 result->str[i] = buf[i];
8204 result->str[len] = 0;
8205 Py_DECREF(str);
8206 return (PyObject*)result;
8207}
8208
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209static int
8210formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008211 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 int flags,
8213 int prec,
8214 int type,
8215 PyObject *v)
8216{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008217 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008218 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8219 * + 1 + 1
8220 * = 24
8221 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008222 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008223 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 long x;
8225
8226 x = PyInt_AsLong(v);
8227 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008228 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008229 if (x < 0 && type == 'u') {
8230 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008231 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008232 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8233 sign = "-";
8234 else
8235 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008237 prec = 1;
8238
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008239 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8240 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008241 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008242 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008244 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 return -1;
8246 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008247
8248 if ((flags & F_ALT) &&
8249 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008250 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 * of issues that cause pain:
8252 * - when 0 is being converted, the C standard leaves off
8253 * the '0x' or '0X', which is inconsistent with other
8254 * %#x/%#X conversions and inconsistent with Python's
8255 * hex() function
8256 * - there are platforms that violate the standard and
8257 * convert 0 with the '0x' or '0X'
8258 * (Metrowerks, Compaq Tru64)
8259 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008260 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008262 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008263 * We can achieve the desired consistency by inserting our
8264 * own '0x' or '0X' prefix, and substituting %x/%X in place
8265 * of %#x/%#X.
8266 *
8267 * Note that this is the same approach as used in
8268 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008269 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008270 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8271 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008272 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008273 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008274 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8275 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008276 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008277 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008278 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008279 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008280 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008281 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282}
8283
8284static int
8285formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008286 size_t buflen,
8287 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008289 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008290 if (PyUnicode_Check(v)) {
8291 if (PyUnicode_GET_SIZE(v) != 1)
8292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008296 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008297 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008298 goto onError;
8299 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302 else {
8303 /* Integer input truncated to a character */
8304 long x;
8305 x = PyInt_AsLong(v);
8306 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008307 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008308#ifdef Py_UNICODE_WIDE
8309 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008310 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008311 "%c arg not in range(0x110000) "
8312 "(wide Python build)");
8313 return -1;
8314 }
8315#else
8316 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008317 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008318 "%c arg not in range(0x10000) "
8319 "(narrow Python build)");
8320 return -1;
8321 }
8322#endif
8323 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
8325 buf[1] = '\0';
8326 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008327
8328 onError:
8329 PyErr_SetString(PyExc_TypeError,
8330 "%c requires int or char");
8331 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008334/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8335
8336 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8337 chars are formatted. XXX This is a magic number. Each formatting
8338 routine does bounds checking to ensure no overflow, but a better
8339 solution may be to malloc a buffer of appropriate size for each
8340 format. For now, the current solution is sufficient.
8341*/
8342#define FORMATBUFLEN (size_t)120
8343
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344PyObject *PyUnicode_Format(PyObject *format,
8345 PyObject *args)
8346{
8347 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008348 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 int args_owned = 0;
8350 PyUnicodeObject *result = NULL;
8351 PyObject *dict = NULL;
8352 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008353
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 if (format == NULL || args == NULL) {
8355 PyErr_BadInternalCall();
8356 return NULL;
8357 }
8358 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008359 if (uformat == NULL)
8360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 fmt = PyUnicode_AS_UNICODE(uformat);
8362 fmtcnt = PyUnicode_GET_SIZE(uformat);
8363
8364 reslen = rescnt = fmtcnt + 100;
8365 result = _PyUnicode_New(reslen);
8366 if (result == NULL)
8367 goto onError;
8368 res = PyUnicode_AS_UNICODE(result);
8369
8370 if (PyTuple_Check(args)) {
8371 arglen = PyTuple_Size(args);
8372 argidx = 0;
8373 }
8374 else {
8375 arglen = -1;
8376 argidx = -2;
8377 }
Christian Heimese93237d2007-12-19 02:37:44 +00008378 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008379 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 dict = args;
8381
8382 while (--fmtcnt >= 0) {
8383 if (*fmt != '%') {
8384 if (--rescnt < 0) {
8385 rescnt = fmtcnt + 100;
8386 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008387 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390 --rescnt;
8391 }
8392 *res++ = *fmt++;
8393 }
8394 else {
8395 /* Got a format specifier */
8396 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 Py_UNICODE c = '\0';
8400 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008401 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 PyObject *v = NULL;
8403 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008404 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008406 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008407 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408
8409 fmt++;
8410 if (*fmt == '(') {
8411 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008412 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 PyObject *key;
8414 int pcount = 1;
8415
8416 if (dict == NULL) {
8417 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008418 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 goto onError;
8420 }
8421 ++fmt;
8422 --fmtcnt;
8423 keystart = fmt;
8424 /* Skip over balanced parentheses */
8425 while (pcount > 0 && --fmtcnt >= 0) {
8426 if (*fmt == ')')
8427 --pcount;
8428 else if (*fmt == '(')
8429 ++pcount;
8430 fmt++;
8431 }
8432 keylen = fmt - keystart - 1;
8433 if (fmtcnt < 0 || pcount > 0) {
8434 PyErr_SetString(PyExc_ValueError,
8435 "incomplete format key");
8436 goto onError;
8437 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008438#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008439 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 then looked up since Python uses strings to hold
8441 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008442 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 key = PyUnicode_EncodeUTF8(keystart,
8444 keylen,
8445 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008446#else
8447 key = PyUnicode_FromUnicode(keystart, keylen);
8448#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 if (key == NULL)
8450 goto onError;
8451 if (args_owned) {
8452 Py_DECREF(args);
8453 args_owned = 0;
8454 }
8455 args = PyObject_GetItem(dict, key);
8456 Py_DECREF(key);
8457 if (args == NULL) {
8458 goto onError;
8459 }
8460 args_owned = 1;
8461 arglen = -1;
8462 argidx = -2;
8463 }
8464 while (--fmtcnt >= 0) {
8465 switch (c = *fmt++) {
8466 case '-': flags |= F_LJUST; continue;
8467 case '+': flags |= F_SIGN; continue;
8468 case ' ': flags |= F_BLANK; continue;
8469 case '#': flags |= F_ALT; continue;
8470 case '0': flags |= F_ZERO; continue;
8471 }
8472 break;
8473 }
8474 if (c == '*') {
8475 v = getnextarg(args, arglen, &argidx);
8476 if (v == NULL)
8477 goto onError;
8478 if (!PyInt_Check(v)) {
8479 PyErr_SetString(PyExc_TypeError,
8480 "* wants int");
8481 goto onError;
8482 }
8483 width = PyInt_AsLong(v);
8484 if (width < 0) {
8485 flags |= F_LJUST;
8486 width = -width;
8487 }
8488 if (--fmtcnt >= 0)
8489 c = *fmt++;
8490 }
8491 else if (c >= '0' && c <= '9') {
8492 width = c - '0';
8493 while (--fmtcnt >= 0) {
8494 c = *fmt++;
8495 if (c < '0' || c > '9')
8496 break;
8497 if ((width*10) / 10 != width) {
8498 PyErr_SetString(PyExc_ValueError,
8499 "width too big");
8500 goto onError;
8501 }
8502 width = width*10 + (c - '0');
8503 }
8504 }
8505 if (c == '.') {
8506 prec = 0;
8507 if (--fmtcnt >= 0)
8508 c = *fmt++;
8509 if (c == '*') {
8510 v = getnextarg(args, arglen, &argidx);
8511 if (v == NULL)
8512 goto onError;
8513 if (!PyInt_Check(v)) {
8514 PyErr_SetString(PyExc_TypeError,
8515 "* wants int");
8516 goto onError;
8517 }
8518 prec = PyInt_AsLong(v);
8519 if (prec < 0)
8520 prec = 0;
8521 if (--fmtcnt >= 0)
8522 c = *fmt++;
8523 }
8524 else if (c >= '0' && c <= '9') {
8525 prec = c - '0';
8526 while (--fmtcnt >= 0) {
8527 c = Py_CHARMASK(*fmt++);
8528 if (c < '0' || c > '9')
8529 break;
8530 if ((prec*10) / 10 != prec) {
8531 PyErr_SetString(PyExc_ValueError,
8532 "prec too big");
8533 goto onError;
8534 }
8535 prec = prec*10 + (c - '0');
8536 }
8537 }
8538 } /* prec */
8539 if (fmtcnt >= 0) {
8540 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 if (--fmtcnt >= 0)
8542 c = *fmt++;
8543 }
8544 }
8545 if (fmtcnt < 0) {
8546 PyErr_SetString(PyExc_ValueError,
8547 "incomplete format");
8548 goto onError;
8549 }
8550 if (c != '%') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 }
8555 sign = 0;
8556 fill = ' ';
8557 switch (c) {
8558
8559 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008560 pbuf = formatbuf;
8561 /* presume that buffer length is at least 1 */
8562 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 len = 1;
8564 break;
8565
8566 case 's':
8567 case 'r':
8568 if (PyUnicode_Check(v) && c == 's') {
8569 temp = v;
8570 Py_INCREF(temp);
8571 }
8572 else {
8573 PyObject *unicode;
8574 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008575 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 else
8577 temp = PyObject_Repr(v);
8578 if (temp == NULL)
8579 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008580 if (PyUnicode_Check(temp))
8581 /* nothing to do */;
8582 else if (PyString_Check(temp)) {
8583 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008584 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008586 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008588 Py_DECREF(temp);
8589 temp = unicode;
8590 if (temp == NULL)
8591 goto onError;
8592 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008593 else {
8594 Py_DECREF(temp);
8595 PyErr_SetString(PyExc_TypeError,
8596 "%s argument has non-string str()");
8597 goto onError;
8598 }
8599 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008600 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 len = PyUnicode_GET_SIZE(temp);
8602 if (prec >= 0 && len > prec)
8603 len = prec;
8604 break;
8605
8606 case 'i':
8607 case 'd':
8608 case 'u':
8609 case 'o':
8610 case 'x':
8611 case 'X':
8612 if (c == 'i')
8613 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008614 isnumok = 0;
8615 if (PyNumber_Check(v)) {
8616 PyObject *iobj=NULL;
8617
8618 if (PyInt_Check(v) || (PyLong_Check(v))) {
8619 iobj = v;
8620 Py_INCREF(iobj);
8621 }
8622 else {
8623 iobj = PyNumber_Int(v);
8624 if (iobj==NULL) iobj = PyNumber_Long(v);
8625 }
8626 if (iobj!=NULL) {
8627 if (PyInt_Check(iobj)) {
8628 isnumok = 1;
8629 pbuf = formatbuf;
8630 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8631 flags, prec, c, iobj);
8632 Py_DECREF(iobj);
8633 if (len < 0)
8634 goto onError;
8635 sign = 1;
8636 }
8637 else if (PyLong_Check(iobj)) {
8638 isnumok = 1;
8639 temp = formatlong(iobj, flags, prec, c);
8640 Py_DECREF(iobj);
8641 if (!temp)
8642 goto onError;
8643 pbuf = PyUnicode_AS_UNICODE(temp);
8644 len = PyUnicode_GET_SIZE(temp);
8645 sign = 1;
8646 }
8647 else {
8648 Py_DECREF(iobj);
8649 }
8650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008652 if (!isnumok) {
8653 PyErr_Format(PyExc_TypeError,
8654 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008655 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008656 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008657 }
8658 if (flags & F_ZERO)
8659 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 break;
8661
8662 case 'e':
8663 case 'E':
8664 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008665 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 case 'g':
8667 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008668 if (c == 'F')
8669 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008670 pbuf = formatbuf;
8671 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8672 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 if (len < 0)
8674 goto onError;
8675 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008676 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 fill = '0';
8678 break;
8679
8680 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008681 pbuf = formatbuf;
8682 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 if (len < 0)
8684 goto onError;
8685 break;
8686
8687 default:
8688 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008689 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008690 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008691 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008692 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008693 (Py_ssize_t)(fmt - 1 -
8694 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 goto onError;
8696 }
8697 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008698 if (*pbuf == '-' || *pbuf == '+') {
8699 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 len--;
8701 }
8702 else if (flags & F_SIGN)
8703 sign = '+';
8704 else if (flags & F_BLANK)
8705 sign = ' ';
8706 else
8707 sign = 0;
8708 }
8709 if (width < len)
8710 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008711 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 reslen -= rescnt;
8713 rescnt = width + fmtcnt + 100;
8714 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008715 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008716 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008717 PyErr_NoMemory();
8718 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008719 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008720 if (_PyUnicode_Resize(&result, reslen) < 0) {
8721 Py_XDECREF(temp);
8722 goto onError;
8723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 res = PyUnicode_AS_UNICODE(result)
8725 + reslen - rescnt;
8726 }
8727 if (sign) {
8728 if (fill != ' ')
8729 *res++ = sign;
8730 rescnt--;
8731 if (width > len)
8732 width--;
8733 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008734 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8735 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008736 assert(pbuf[1] == c);
8737 if (fill != ' ') {
8738 *res++ = *pbuf++;
8739 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008740 }
Tim Petersfff53252001-04-12 18:38:48 +00008741 rescnt -= 2;
8742 width -= 2;
8743 if (width < 0)
8744 width = 0;
8745 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 if (width > len && !(flags & F_LJUST)) {
8748 do {
8749 --rescnt;
8750 *res++ = fill;
8751 } while (--width > len);
8752 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008753 if (fill == ' ') {
8754 if (sign)
8755 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008756 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008757 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008758 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008759 *res++ = *pbuf++;
8760 *res++ = *pbuf++;
8761 }
8762 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008763 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 res += len;
8765 rescnt -= len;
8766 while (--width >= len) {
8767 --rescnt;
8768 *res++ = ' ';
8769 }
8770 if (dict && (argidx < arglen) && c != '%') {
8771 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008772 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008773 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 goto onError;
8775 }
8776 Py_XDECREF(temp);
8777 } /* '%' */
8778 } /* until end */
8779 if (argidx < arglen && !dict) {
8780 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008781 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 goto onError;
8783 }
8784
Thomas Woutersa96affe2006-03-12 00:29:36 +00008785 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (args_owned) {
8788 Py_DECREF(args);
8789 }
8790 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 return (PyObject *)result;
8792
8793 onError:
8794 Py_XDECREF(result);
8795 Py_DECREF(uformat);
8796 if (args_owned) {
8797 Py_DECREF(args);
8798 }
8799 return NULL;
8800}
8801
8802static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008803 (readbufferproc) unicode_buffer_getreadbuf,
8804 (writebufferproc) unicode_buffer_getwritebuf,
8805 (segcountproc) unicode_buffer_getsegcount,
8806 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807};
8808
Jeremy Hylton938ace62002-07-17 16:30:39 +00008809static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8811
Tim Peters6d6c1a32001-08-02 04:15:00 +00008812static PyObject *
8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8814{
8815 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008816 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008817 char *encoding = NULL;
8818 char *errors = NULL;
8819
Guido van Rossume023fe02001-08-30 03:12:59 +00008820 if (type != &PyUnicode_Type)
8821 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8823 kwlist, &x, &encoding, &errors))
8824 return NULL;
8825 if (x == NULL)
8826 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008827 if (encoding == NULL && errors == NULL)
8828 return PyObject_Unicode(x);
8829 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008830 return PyUnicode_FromEncodedObject(x, encoding, errors);
8831}
8832
Guido van Rossume023fe02001-08-30 03:12:59 +00008833static PyObject *
8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8835{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008836 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008838
8839 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8840 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8841 if (tmp == NULL)
8842 return NULL;
8843 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008844 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008845 if (pnew == NULL) {
8846 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008847 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008848 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008849 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008850 if (pnew->str == NULL) {
8851 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008852 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008853 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008854 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008855 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008856 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8857 pnew->length = n;
8858 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008859 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008860 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008864"unicode(string [, encoding[, errors]]) -> object\n\
8865\n\
8866Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008867encoding defaults to the current default string encoding.\n\
8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008871 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 "unicode", /* tp_name */
8873 sizeof(PyUnicodeObject), /* tp_size */
8874 0, /* tp_itemsize */
8875 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008876 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008878 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008880 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008881 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008882 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008884 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 (hashfunc) unicode_hash, /* tp_hash*/
8886 0, /* tp_call*/
8887 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888 PyObject_GenericGetAttr, /* tp_getattro */
8889 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008892 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008893 unicode_doc, /* tp_doc */
8894 0, /* tp_traverse */
8895 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008896 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008897 0, /* tp_weaklistoffset */
8898 0, /* tp_iter */
8899 0, /* tp_iternext */
8900 unicode_methods, /* tp_methods */
8901 0, /* tp_members */
8902 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008903 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008904 0, /* tp_dict */
8905 0, /* tp_descr_get */
8906 0, /* tp_descr_set */
8907 0, /* tp_dictoffset */
8908 0, /* tp_init */
8909 0, /* tp_alloc */
8910 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008911 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912};
8913
8914/* Initialize the Unicode implementation */
8915
Thomas Wouters78890102000-07-22 19:25:51 +00008916void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008918 int i;
8919
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008920 /* XXX - move this array to unicodectype.c ? */
8921 Py_UNICODE linebreak[] = {
8922 0x000A, /* LINE FEED */
8923 0x000D, /* CARRIAGE RETURN */
8924 0x001C, /* FILE SEPARATOR */
8925 0x001D, /* GROUP SEPARATOR */
8926 0x001E, /* RECORD SEPARATOR */
8927 0x0085, /* NEXT LINE */
8928 0x2028, /* LINE SEPARATOR */
8929 0x2029, /* PARAGRAPH SEPARATOR */
8930 };
8931
Fred Drakee4315f52000-05-09 19:53:39 +00008932 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008933 free_list = NULL;
8934 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008936 if (!unicode_empty)
8937 return;
8938
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008939 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008940 for (i = 0; i < 256; i++)
8941 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008942 if (PyType_Ready(&PyUnicode_Type) < 0)
8943 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008944
8945 /* initialize the linebreak bloom filter */
8946 bloom_linebreak = make_bloom_mask(
8947 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8948 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008949
8950 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951}
8952
8953/* Finalize the Unicode implementation */
8954
Christian Heimes3b718a72008-02-14 12:47:33 +00008955int
8956PyUnicode_ClearFreeList(void)
8957{
8958 int freelist_size = numfree;
8959 PyUnicodeObject *u;
8960
8961 for (u = free_list; u != NULL;) {
8962 PyUnicodeObject *v = u;
8963 u = *(PyUnicodeObject **)u;
8964 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00008965 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00008966 Py_XDECREF(v->defenc);
8967 PyObject_Del(v);
8968 numfree--;
8969 }
8970 free_list = NULL;
8971 assert(numfree == 0);
8972 return freelist_size;
8973}
8974
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975void
Thomas Wouters78890102000-07-22 19:25:51 +00008976_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008978 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008980 Py_XDECREF(unicode_empty);
8981 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008982
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008983 for (i = 0; i < 256; i++) {
8984 if (unicode_latin1[i]) {
8985 Py_DECREF(unicode_latin1[i]);
8986 unicode_latin1[i] = NULL;
8987 }
8988 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008989 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008991
Anthony Baxterac6bd462006-04-13 02:06:09 +00008992#ifdef __cplusplus
8993}
8994#endif
8995
8996
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008997/*
8998Local variables:
8999c-basic-offset: 4
9000indent-tabs-mode: nil
9001End:
9002*/