blob: 317d03bc1c83e10d4d8fcfe5a1c8895996ce9310 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
Christian Heimes5b970ad2008-02-06 13:33:44 +000056#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Christian Heimes5b970ad2008-02-06 13:33:44 +000064 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per 7-bit ASCII code (128 entries, eight
   per row): the entry is 1 if the character counts as whitespace and 0
   otherwise.  Callers must only index it with values below 128. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: HORIZONTAL TABULATION
       0x000A: LINE FEED
       0x000B: VERTICAL TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Same for linebreaks.

   Lookup table with one entry per 7-bit ASCII code (128 entries, eight
   per row): the entry is 1 if the character is treated as a line break
   and 0 otherwise.  It is consulted by BLOOM_LINEBREAK() for characters
   below 128. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
/* Integer type holding a bloom mask; only bit positions 0..31 are used. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the 5 least significant bits of ch is
   set in mask.  The shifted constant is 1UL rather than 1: shifting a
   signed int left by 31 is undefined behaviour, and index 31 is a valid
   bit position here. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break.  Characters below 128 are looked up
   directly in ascii_linebreak[]; for all others the bloom mask is tested
   first and Py_UNICODE_ISLINEBREAK() is only evaluated if the mask bit
   is set.  Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                         \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                          \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Non-zero if chr occurs in set[0..setlen-1].  The cheap BLOOM() test on
   mask runs first and unicode_member() is only called when the mask bit
   is set (mask is presumably the bloom mask computed from set -- confirm
   at the call sites).  The whole expansion is parenthesized so that the
   macro can safely be negated or combined with other operators.  Note
   that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000264 unicode->str = PyObject_REALLOC(unicode->str,
265 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000267 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_NoMemory();
269 return -1;
270 }
271 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000272 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 if (unicode->defenc) {
277 Py_DECREF(unicode->defenc);
278 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return 0;
283}
284
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
292
293static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295{
296 register PyUnicodeObject *unicode;
297
Andrew Dalkee0df7622006-05-27 11:04:36 +0000298 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (length == 0 && unicode_empty != NULL) {
300 Py_INCREF(unicode_empty);
301 return unicode_empty;
302 }
303
304 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000305 if (free_list) {
306 unicode = free_list;
307 free_list = *(PyUnicodeObject **)unicode;
308 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 /* Keep-Alive optimization: we only upsize the buffer,
311 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000312 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000313 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000314 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000318 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000319 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
320 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 }
322 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000325 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000326 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (unicode == NULL)
328 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000329 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000333 if (!unicode->str) {
334 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000335 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000337 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000338 * the caller fails before initializing str -- unicode_resize()
339 * reads str[0], and the Keep-Alive optimization can keep memory
340 * allocated for str alive across a call to unicode_dealloc(unicode).
341 * We don't want unicode_resize to read uninitialized memory in
342 * that case.
343 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000344 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000346 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350
351 onError:
352 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000353 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355}
356
357static
Guido van Rossum9475a232001-10-05 20:51:39 +0000358void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000360 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000361 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000362 /* Keep-Alive optimization */
363 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000364 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str = NULL;
366 unicode->length = 0;
367 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000368 if (unicode->defenc) {
369 Py_DECREF(unicode->defenc);
370 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000371 }
372 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000373 *(PyUnicodeObject **)unicode = free_list;
374 free_list = unicode;
375 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 }
377 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000378 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000380 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382}
383
Martin v. Löwis18e16552006-02-15 17:27:45 +0000384int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000385{
386 register PyUnicodeObject *v;
387
388 /* Argument checks */
389 if (unicode == NULL) {
390 PyErr_BadInternalCall();
391 return -1;
392 }
393 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000394 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 PyErr_BadInternalCall();
396 return -1;
397 }
398
399 /* Resizing unicode_empty and single character objects is not
400 possible since these are being shared. We simply return a fresh
401 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000402 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 (v == unicode_empty || v->length == 1)) {
404 PyUnicodeObject *w = _PyUnicode_New(length);
405 if (w == NULL)
406 return -1;
407 Py_UNICODE_COPY(w->str, v->str,
408 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000409 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000410 *unicode = (PyObject *)w;
411 return 0;
412 }
413
414 /* Note that we don't have to modify *unicode for unshared Unicode
415 objects, since we can modify them in-place. */
416 return unicode_resize(v, length);
417}
418
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() with unicodevar cast to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000424 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425{
426 PyUnicodeObject *unicode;
427
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 /* If the Unicode data is known at construction time, we can apply
429 some optimizations which share commonly used objects. */
430 if (u != NULL) {
431
432 /* Optimization for empty strings */
433 if (size == 0 && unicode_empty != NULL) {
434 Py_INCREF(unicode_empty);
435 return (PyObject *)unicode_empty;
436 }
437
438 /* Single character Unicode objects in the Latin-1 range are
439 shared when using this constructor */
440 if (size == 1 && *u < 256) {
441 unicode = unicode_latin1[*u];
442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 if (!unicode)
445 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000446 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 unicode_latin1[*u] = unicode;
448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
Tim Petersced69f82003-09-16 20:30:58 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 unicode = _PyUnicode_New(size);
455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461
462 return (PyObject *)unicode;
463}
464
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000465PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
466{
467 PyUnicodeObject *unicode;
468 /* If the Unicode data is known at construction time, we can apply
469 some optimizations which share commonly used objects.
470 Also, this means the input must be UTF-8, so fall back to the
471 UTF-8 decoder at the end. */
472 if (u != NULL) {
473
474 /* Optimization for empty strings */
475 if (size == 0 && unicode_empty != NULL) {
476 Py_INCREF(unicode_empty);
477 return (PyObject *)unicode_empty;
478 }
479
480 /* Single characters are shared when using this constructor.
481 Restrict to ASCII, since the input must be UTF-8. */
482 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitz231346e2008-03-27 04:40:50 +0000483 unicode = unicode_latin1[(unsigned)Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000484 if (!unicode) {
485 unicode = _PyUnicode_New(1);
486 if (!unicode)
487 return NULL;
488 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitz231346e2008-03-27 04:40:50 +0000489 unicode_latin1[(unsigned)Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000490 }
491 Py_INCREF(unicode);
492 return (PyObject *)unicode;
493 }
494
495 return PyUnicode_DecodeUTF8(u, size, NULL);
496 }
497
498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 return (PyObject *)unicode;
503}
504
505PyObject *PyUnicode_FromString(const char *u)
506{
507 size_t size = strlen(u);
508 if (size > PY_SSIZE_T_MAX) {
509 PyErr_SetString(PyExc_OverflowError, "input too long");
510 return NULL;
511 }
512
513 return PyUnicode_FromStringAndSize(u, size);
514}
515
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516#ifdef HAVE_WCHAR_H
517
518PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000519 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520{
521 PyUnicodeObject *unicode;
522
523 if (w == NULL) {
524 PyErr_BadInternalCall();
525 return NULL;
526 }
527
528 unicode = _PyUnicode_New(size);
529 if (!unicode)
530 return NULL;
531
532 /* Copy the wchar_t data into the new object */
533#ifdef HAVE_USABLE_WCHAR_T
534 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000535#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536 {
537 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000538 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000539 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000540 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541 *u++ = *w++;
542 }
543#endif
544
545 return (PyObject *)unicode;
546}
547
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000548static void
549makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
550{
551 *fmt++ = '%';
552 if (width) {
553 if (zeropad)
554 *fmt++ = '0';
555 fmt += sprintf(fmt, "%d", width);
556 }
557 if (precision)
558 fmt += sprintf(fmt, ".%d", precision);
559 if (longflag)
560 *fmt++ = 'l';
561 else if (size_tflag) {
562 char *f = PY_FORMAT_SIZE_T;
563 while (*f)
564 *fmt++ = *f++;
565 }
566 *fmt++ = c;
567 *fmt = '\0';
568}
569
/* Append the NUL-terminated char string `string` to the output, one
   char per output element, advancing the output pointer; the
   terminating NUL is not copied.  Relies on two variables of the
   expanding function: `copy` (a char pointer used as cursor) and `s`
   (the output pointer).  Wrapped in do { } while (0) so that the macro
   behaves like a single statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
571
572PyObject *
573PyUnicode_FromFormatV(const char *format, va_list vargs)
574{
575 va_list count;
576 Py_ssize_t callcount = 0;
577 PyObject **callresults = NULL;
578 PyObject **callresult = NULL;
579 Py_ssize_t n = 0;
580 int width = 0;
581 int precision = 0;
582 int zeropad;
583 const char* f;
584 Py_UNICODE *s;
585 PyObject *string;
586 /* used by sprintf */
587 char buffer[21];
588 /* use abuffer instead of buffer, if we need more space
589 * (which can happen if there's a format specifier with width). */
590 char *abuffer = NULL;
591 char *realbuffer;
592 Py_ssize_t abuffersize = 0;
593 char fmt[60]; /* should be enough for %0width.precisionld */
594 const char *copy;
595
596#ifdef VA_LIST_IS_ARRAY
597 Py_MEMCPY(count, vargs, sizeof(va_list));
598#else
599#ifdef __va_copy
600 __va_copy(count, vargs);
601#else
602 count = vargs;
603#endif
604#endif
605 /* step 1: count the number of %S/%R format specifications
606 * (we call PyObject_Str()/PyObject_Repr() for these objects
607 * once during step 3 and put the result in an array) */
608 for (f = format; *f; f++) {
609 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
610 ++callcount;
611 }
612 /* step 2: allocate memory for the results of
613 * PyObject_Str()/PyObject_Repr() calls */
614 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000615 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000616 if (!callresults) {
617 PyErr_NoMemory();
618 return NULL;
619 }
620 callresult = callresults;
621 }
622 /* step 3: figure out how large a buffer we need */
623 for (f = format; *f; f++) {
624 if (*f == '%') {
625 const char* p = f;
626 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000627 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000628 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000629 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000630 ;
631
632 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
633 * they don't affect the amount of space we reserve.
634 */
635 if ((*f == 'l' || *f == 'z') &&
636 (f[1] == 'd' || f[1] == 'u'))
637 ++f;
638
639 switch (*f) {
640 case 'c':
641 (void)va_arg(count, int);
642 /* fall through... */
643 case '%':
644 n++;
645 break;
646 case 'd': case 'u': case 'i': case 'x':
647 (void) va_arg(count, int);
648 /* 20 bytes is enough to hold a 64-bit
649 integer. Decimal takes the most space.
650 This isn't enough for octal.
651 If a width is specified we need more
652 (which we allocate later). */
653 if (width < 20)
654 width = 20;
655 n += width;
656 if (abuffersize < width)
657 abuffersize = width;
658 break;
659 case 's':
660 {
661 /* UTF-8 */
662 unsigned char*s;
663 s = va_arg(count, unsigned char*);
664 while (*s) {
665 if (*s < 128) {
666 n++; s++;
667 } else if (*s < 0xc0) {
668 /* invalid UTF-8 */
669 n++; s++;
670 } else if (*s < 0xc0) {
671 n++;
672 s++; if(!*s)break;
673 s++;
674 } else if (*s < 0xe0) {
675 n++;
676 s++; if(!*s)break;
677 s++; if(!*s)break;
678 s++;
679 } else {
680 #ifdef Py_UNICODE_WIDE
681 n++;
682 #else
683 n+=2;
684 #endif
685 s++; if(!*s)break;
686 s++; if(!*s)break;
687 s++; if(!*s)break;
688 s++;
689 }
690 }
691 break;
692 }
693 case 'U':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 assert(obj && PyUnicode_Check(obj));
697 n += PyUnicode_GET_SIZE(obj);
698 break;
699 }
700 case 'V':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 const char *str = va_arg(count, const char *);
704 assert(obj || str);
705 assert(!obj || PyUnicode_Check(obj));
706 if (obj)
707 n += PyUnicode_GET_SIZE(obj);
708 else
709 n += strlen(str);
710 break;
711 }
712 case 'S':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 PyObject *str;
716 assert(obj);
717 str = PyObject_Str(obj);
718 if (!str)
719 goto fail;
720 n += PyUnicode_GET_SIZE(str);
721 /* Remember the str and switch to the next slot */
722 *callresult++ = str;
723 break;
724 }
725 case 'R':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 PyObject *repr;
729 assert(obj);
730 repr = PyObject_Repr(obj);
731 if (!repr)
732 goto fail;
733 n += PyUnicode_GET_SIZE(repr);
734 /* Remember the repr and switch to the next slot */
735 *callresult++ = repr;
736 break;
737 }
738 case 'p':
739 (void) va_arg(count, int);
740 /* maximum 64-bit pointer representation:
741 * 0xffffffffffffffff
742 * so 19 characters is enough.
743 * XXX I count 18 -- what's the extra for?
744 */
745 n += 19;
746 break;
747 default:
748 /* if we stumble upon an unknown
749 formatting code, copy the rest of
750 the format string to the output
751 string. (we cannot just skip the
752 code, since there's no way to know
753 what's in the argument list) */
754 n += strlen(p);
755 goto expand;
756 }
757 } else
758 n++;
759 }
760 expand:
761 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000762 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000763 if (!abuffer) {
764 PyErr_NoMemory();
765 goto fail;
766 }
767 realbuffer = abuffer;
768 }
769 else
770 realbuffer = buffer;
771 /* step 4: fill the buffer */
772 /* Since we've analyzed how much space we need for the worst case,
773 we don't have to resize the string.
774 There can be no errors beyond this point. */
775 string = PyUnicode_FromUnicode(NULL, n);
776 if (!string)
777 goto fail;
778
779 s = PyUnicode_AS_UNICODE(string);
780 callresult = callresults;
781
782 for (f = format; *f; f++) {
783 if (*f == '%') {
784 const char* p = f++;
785 int longflag = 0;
786 int size_tflag = 0;
787 zeropad = (*f == '0');
788 /* parse the width.precision part */
789 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000790 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000791 width = (width*10) + *f++ - '0';
792 precision = 0;
793 if (*f == '.') {
794 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000795 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000796 precision = (precision*10) + *f++ - '0';
797 }
798 /* handle the long flag, but only for %ld and %lu.
799 others can be added when necessary. */
800 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
801 longflag = 1;
802 ++f;
803 }
804 /* handle the size_t flag. */
805 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
806 size_tflag = 1;
807 ++f;
808 }
809
810 switch (*f) {
811 case 'c':
812 *s++ = va_arg(vargs, int);
813 break;
814 case 'd':
815 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
816 if (longflag)
817 sprintf(realbuffer, fmt, va_arg(vargs, long));
818 else if (size_tflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
820 else
821 sprintf(realbuffer, fmt, va_arg(vargs, int));
822 appendstring(realbuffer);
823 break;
824 case 'u':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
832 appendstring(realbuffer);
833 break;
834 case 'i':
835 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
836 sprintf(realbuffer, fmt, va_arg(vargs, int));
837 appendstring(realbuffer);
838 break;
839 case 'x':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 's':
845 {
846 /* Parameter must be UTF-8 encoded.
847 In case of encoding errors, use
848 the replacement character. */
849 PyObject *u;
850 p = va_arg(vargs, char*);
851 u = PyUnicode_DecodeUTF8(p, strlen(p),
852 "replace");
853 if (!u)
854 goto fail;
855 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
856 PyUnicode_GET_SIZE(u));
857 s += PyUnicode_GET_SIZE(u);
858 Py_DECREF(u);
859 break;
860 }
861 case 'U':
862 {
863 PyObject *obj = va_arg(vargs, PyObject *);
864 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
865 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
866 s += size;
867 break;
868 }
869 case 'V':
870 {
871 PyObject *obj = va_arg(vargs, PyObject *);
872 const char *str = va_arg(vargs, const char *);
873 if (obj) {
874 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
875 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
876 s += size;
877 } else {
878 appendstring(str);
879 }
880 break;
881 }
882 case 'S':
883 case 'R':
884 {
885 Py_UNICODE *ucopy;
886 Py_ssize_t usize;
887 Py_ssize_t upos;
888 /* unused, since we already have the result */
889 (void) va_arg(vargs, PyObject *);
890 ucopy = PyUnicode_AS_UNICODE(*callresult);
891 usize = PyUnicode_GET_SIZE(*callresult);
892 for (upos = 0; upos<usize;)
893 *s++ = ucopy[upos++];
894 /* We're done with the unicode()/repr() => forget it */
895 Py_DECREF(*callresult);
896 /* switch to next unicode()/repr() result */
897 ++callresult;
898 break;
899 }
900 case 'p':
901 sprintf(buffer, "%p", va_arg(vargs, void*));
902 /* %p is ill-defined: ensure leading 0x. */
903 if (buffer[1] == 'X')
904 buffer[1] = 'x';
905 else if (buffer[1] != 'x') {
906 memmove(buffer+2, buffer, strlen(buffer)+1);
907 buffer[0] = '0';
908 buffer[1] = 'x';
909 }
910 appendstring(buffer);
911 break;
912 case '%':
913 *s++ = '%';
914 break;
915 default:
916 appendstring(p);
917 goto end;
918 }
919 } else
920 *s++ = *f;
921 }
922
923 end:
924 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000925 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000926 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000927 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000928 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
929 return string;
930 fail:
931 if (callresults) {
932 PyObject **callresult2 = callresults;
933 while (callresult2 < callresult) {
934 Py_DECREF(*callresult2);
935 ++callresult2;
936 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000937 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938 }
939 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000940 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000941 return NULL;
942}
943
944#undef appendstring
945
946PyObject *
947PyUnicode_FromFormat(const char *format, ...)
948{
949 PyObject* ret;
950 va_list vargs;
951
952#ifdef HAVE_STDARG_PROTOTYPES
953 va_start(vargs, format);
954#else
955 va_start(vargs);
956#endif
957 ret = PyUnicode_FromFormatV(format, vargs);
958 va_end(vargs);
959 return ret;
960}
961
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
963 wchar_t *w,
964 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965{
966 if (unicode == NULL) {
967 PyErr_BadInternalCall();
968 return -1;
969 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000970
971 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000973 size = PyUnicode_GET_SIZE(unicode) + 1;
974
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975#ifdef HAVE_USABLE_WCHAR_T
976 memcpy(w, unicode->str, size * sizeof(wchar_t));
977#else
978 {
979 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000980 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000981 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000982 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983 *w++ = *u++;
984 }
985#endif
986
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000987 if (size > PyUnicode_GET_SIZE(unicode))
988 return PyUnicode_GET_SIZE(unicode);
989 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990 return size;
991}
992
993#endif
994
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000995PyObject *PyUnicode_FromOrdinal(int ordinal)
996{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000997 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000998
999#ifdef Py_UNICODE_WIDE
1000 if (ordinal < 0 || ordinal > 0x10ffff) {
1001 PyErr_SetString(PyExc_ValueError,
1002 "unichr() arg not in range(0x110000) "
1003 "(wide Python build)");
1004 return NULL;
1005 }
1006#else
1007 if (ordinal < 0 || ordinal > 0xffff) {
1008 PyErr_SetString(PyExc_ValueError,
1009 "unichr() arg not in range(0x10000) "
1010 "(narrow Python build)");
1011 return NULL;
1012 }
1013#endif
1014
Hye-Shik Chang40574832004-04-06 07:24:51 +00001015 s[0] = (Py_UNICODE)ordinal;
1016 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001017}
1018
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019PyObject *PyUnicode_FromObject(register PyObject *obj)
1020{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 /* XXX Perhaps we should make this API an alias of
1022 PyObject_Unicode() instead ?! */
1023 if (PyUnicode_CheckExact(obj)) {
1024 Py_INCREF(obj);
1025 return obj;
1026 }
1027 if (PyUnicode_Check(obj)) {
1028 /* For a Unicode subtype that's not a Unicode object,
1029 return a true Unicode object with the same data. */
1030 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1031 PyUnicode_GET_SIZE(obj));
1032 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001033 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1034}
1035
1036PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1037 const char *encoding,
1038 const char *errors)
1039{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001040 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044 if (obj == NULL) {
1045 PyErr_BadInternalCall();
1046 return NULL;
1047 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001049#if 0
1050 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001051 that no encodings is given and then redirect to
1052 PyObject_Unicode() which then applies the additional logic for
1053 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055 NOTE: This API should really only be used for object which
1056 represent *encoded* Unicode !
1057
1058 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001059 if (PyUnicode_Check(obj)) {
1060 if (encoding) {
1061 PyErr_SetString(PyExc_TypeError,
1062 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001066 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067#else
1068 if (PyUnicode_Check(obj)) {
1069 PyErr_SetString(PyExc_TypeError,
1070 "decoding Unicode is not supported");
1071 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001072 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001073#endif
1074
1075 /* Coerce object */
1076 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001077 s = PyString_AS_STRING(obj);
1078 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001079 }
1080 else if (PyBytes_Check(obj)) {
1081 /* Python 2.x specific */
1082 PyErr_Format(PyExc_TypeError,
1083 "decoding bytearray is not supported");
1084 return NULL;
1085 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1087 /* Overwrite the error message with something more useful in
1088 case of a TypeError. */
1089 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001090 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001091 "coercing to Unicode: need string or buffer, "
1092 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001093 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001094 goto onError;
1095 }
Tim Petersced69f82003-09-16 20:30:58 +00001096
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001097 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 if (len == 0) {
1099 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
Tim Petersced69f82003-09-16 20:30:58 +00001102 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001103 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001104
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 return v;
1106
1107 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109}
1110
1111PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001112 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 const char *encoding,
1114 const char *errors)
1115{
1116 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001117
1118 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001119 encoding = PyUnicode_GetDefaultEncoding();
1120
1121 /* Shortcuts for common default encodings */
1122 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001124 else if (strcmp(encoding, "latin-1") == 0)
1125 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001126#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1127 else if (strcmp(encoding, "mbcs") == 0)
1128 return PyUnicode_DecodeMBCS(s, size, errors);
1129#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001130 else if (strcmp(encoding, "ascii") == 0)
1131 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132
1133 /* Decode via the codec registry */
1134 buffer = PyBuffer_FromMemory((void *)s, size);
1135 if (buffer == NULL)
1136 goto onError;
1137 unicode = PyCodec_Decode(buffer, encoding, errors);
1138 if (unicode == NULL)
1139 goto onError;
1140 if (!PyUnicode_Check(unicode)) {
1141 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001142 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001143 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 Py_DECREF(unicode);
1145 goto onError;
1146 }
1147 Py_DECREF(buffer);
1148 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001149
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 onError:
1151 Py_XDECREF(buffer);
1152 return NULL;
1153}
1154
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001155PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1156 const char *encoding,
1157 const char *errors)
1158{
1159 PyObject *v;
1160
1161 if (!PyUnicode_Check(unicode)) {
1162 PyErr_BadArgument();
1163 goto onError;
1164 }
1165
1166 if (encoding == NULL)
1167 encoding = PyUnicode_GetDefaultEncoding();
1168
1169 /* Decode via the codec registry */
1170 v = PyCodec_Decode(unicode, encoding, errors);
1171 if (v == NULL)
1172 goto onError;
1173 return v;
1174
1175 onError:
1176 return NULL;
1177}
1178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001180 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 const char *encoding,
1182 const char *errors)
1183{
1184 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001185
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186 unicode = PyUnicode_FromUnicode(s, size);
1187 if (unicode == NULL)
1188 return NULL;
1189 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1190 Py_DECREF(unicode);
1191 return v;
1192}
1193
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001194PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1195 const char *encoding,
1196 const char *errors)
1197{
1198 PyObject *v;
1199
1200 if (!PyUnicode_Check(unicode)) {
1201 PyErr_BadArgument();
1202 goto onError;
1203 }
1204
1205 if (encoding == NULL)
1206 encoding = PyUnicode_GetDefaultEncoding();
1207
1208 /* Encode via the codec registry */
1209 v = PyCodec_Encode(unicode, encoding, errors);
1210 if (v == NULL)
1211 goto onError;
1212 return v;
1213
1214 onError:
1215 return NULL;
1216}
1217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1219 const char *encoding,
1220 const char *errors)
1221{
1222 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 if (!PyUnicode_Check(unicode)) {
1225 PyErr_BadArgument();
1226 goto onError;
1227 }
Fred Drakee4315f52000-05-09 19:53:39 +00001228
Tim Petersced69f82003-09-16 20:30:58 +00001229 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001230 encoding = PyUnicode_GetDefaultEncoding();
1231
1232 /* Shortcuts for common default encodings */
1233 if (errors == NULL) {
1234 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001235 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001236 else if (strcmp(encoding, "latin-1") == 0)
1237 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001238#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1239 else if (strcmp(encoding, "mbcs") == 0)
1240 return PyUnicode_AsMBCSString(unicode);
1241#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001242 else if (strcmp(encoding, "ascii") == 0)
1243 return PyUnicode_AsASCIIString(unicode);
1244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245
1246 /* Encode via the codec registry */
1247 v = PyCodec_Encode(unicode, encoding, errors);
1248 if (v == NULL)
1249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 if (!PyString_Check(v)) {
1251 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001252 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001253 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 Py_DECREF(v);
1255 goto onError;
1256 }
1257 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 onError:
1260 return NULL;
1261}
1262
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001263PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1264 const char *errors)
1265{
1266 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1267
1268 if (v)
1269 return v;
1270 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1271 if (v && errors == NULL)
1272 ((PyUnicodeObject *)unicode)->defenc = v;
1273 return v;
1274}
1275
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1277{
1278 if (!PyUnicode_Check(unicode)) {
1279 PyErr_BadArgument();
1280 goto onError;
1281 }
1282 return PyUnicode_AS_UNICODE(unicode);
1283
1284 onError:
1285 return NULL;
1286}
1287
Martin v. Löwis18e16552006-02-15 17:27:45 +00001288Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
1290 if (!PyUnicode_Check(unicode)) {
1291 PyErr_BadArgument();
1292 goto onError;
1293 }
1294 return PyUnicode_GET_SIZE(unicode);
1295
1296 onError:
1297 return -1;
1298}
1299
Thomas Wouters78890102000-07-22 19:25:51 +00001300const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001301{
1302 return unicode_default_encoding;
1303}
1304
1305int PyUnicode_SetDefaultEncoding(const char *encoding)
1306{
1307 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001308
Fred Drakee4315f52000-05-09 19:53:39 +00001309 /* Make sure the encoding is valid. As side effect, this also
1310 loads the encoding into the codec registry cache. */
1311 v = _PyCodec_Lookup(encoding);
1312 if (v == NULL)
1313 goto onError;
1314 Py_DECREF(v);
1315 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001316 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001317 sizeof(unicode_default_encoding));
1318 return 0;
1319
1320 onError:
1321 return -1;
1322}
1323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324/* error handling callback helper:
1325 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001326 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327 and adjust various state variables.
1328 return 0 on success, -1 on error
1329*/
1330
1331static
1332int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1333 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001334 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1335 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001338 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339
1340 PyObject *restuple = NULL;
1341 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001342 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1343 Py_ssize_t requiredsize;
1344 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001346 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 int res = -1;
1348
1349 if (*errorHandler == NULL) {
1350 *errorHandler = PyCodec_LookupError(errors);
1351 if (*errorHandler == NULL)
1352 goto onError;
1353 }
1354
1355 if (*exceptionObject == NULL) {
1356 *exceptionObject = PyUnicodeDecodeError_Create(
1357 encoding, input, insize, *startinpos, *endinpos, reason);
1358 if (*exceptionObject == NULL)
1359 goto onError;
1360 }
1361 else {
1362 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1363 goto onError;
1364 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1365 goto onError;
1366 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1367 goto onError;
1368 }
1369
1370 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1371 if (restuple == NULL)
1372 goto onError;
1373 if (!PyTuple_Check(restuple)) {
1374 PyErr_Format(PyExc_TypeError, &argparse[4]);
1375 goto onError;
1376 }
1377 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1378 goto onError;
1379 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001380 newpos = insize+newpos;
1381 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001382 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001383 goto onError;
1384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385
1386 /* need more space? (at least enough for what we
1387 have+the replacement+the rest of the string (starting
1388 at the new input position), so we won't have to check space
1389 when there are no errors in the rest of the string) */
1390 repptr = PyUnicode_AS_UNICODE(repunicode);
1391 repsize = PyUnicode_GET_SIZE(repunicode);
1392 requiredsize = *outpos + repsize + insize-newpos;
1393 if (requiredsize > outsize) {
1394 if (requiredsize<2*outsize)
1395 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001396 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 goto onError;
1398 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1399 }
1400 *endinpos = newpos;
1401 *inptr = input + newpos;
1402 Py_UNICODE_COPY(*outptr, repptr, repsize);
1403 *outptr += repsize;
1404 *outpos += repsize;
1405 /* we made it! */
1406 res = 0;
1407
1408 onError:
1409 Py_XDECREF(restuple);
1410 return res;
1411}
1412
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001413/* --- UTF-7 Codec -------------------------------------------------------- */
1414
1415/* see RFC2152 for details */
1416
/* Classification of the 128 ASCII code points for UTF-7, indexed by
   character code.  The value tells whether the character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when c cannot be written directly
   in UTF-7, i.e. it lies outside 1..127, is marked 1 in utf7_special, or
   is marked 2/3 and the corresponding encodeWS/encodeO flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.
   The range guard comes first because isalnum() is only defined for
   values representable as unsigned char (or EOF) and may be
   locale-dependent above 127, while c is typically a Py_UNICODE.  The
   mask form rejects negative values as well without provoking
   "comparison is always true" warnings for unsigned operands. */
#define B64CHAR(c) \
    ((((c) & ~0x7f) == 0) && \
     (isalnum(c) || (c) == '+' || (c) == '/'))

/* UB64(c): six-bit value of the base64 character c (inverse of B64);
   only meaningful when B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the topmost six as a base64 character to *out++ and reduce
   bits accordingly.  Expands to a bare while statement. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract one 16-bit unit and write it to *out++.
   Requires a variable `errmsg` and a label `utf7Error` in the
   expanding function.  Expands to a bare while statement. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001479PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001480 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 const char *errors)
1482{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001483 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1484}
1485
1486PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1487 Py_ssize_t size,
1488 const char *errors,
1489 Py_ssize_t *consumed)
1490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001492 Py_ssize_t startinpos;
1493 Py_ssize_t endinpos;
1494 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 const char *e;
1496 PyUnicodeObject *unicode;
1497 Py_UNICODE *p;
1498 const char *errmsg = "";
1499 int inShift = 0;
1500 unsigned int bitsleft = 0;
1501 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502 int surrogate = 0;
1503 PyObject *errorHandler = NULL;
1504 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001505
1506 unicode = _PyUnicode_New(size);
1507 if (!unicode)
1508 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001509 if (size == 0) {
1510 if (consumed)
1511 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001513 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001514
1515 p = unicode->str;
1516 e = s + size;
1517
1518 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 Py_UNICODE ch;
1520 restart:
1521 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522
1523 if (inShift) {
1524 if ((ch == '-') || !B64CHAR(ch)) {
1525 inShift = 0;
1526 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001527
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1529 if (bitsleft >= 6) {
1530 /* The shift sequence has a partial character in it. If
1531 bitsleft < 6 then we could just classify it as padding
1532 but that is not the case here */
1533
1534 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001535 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 }
1537 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001538 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 here so indicate the potential of a misencoded character. */
1540
1541 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1542 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1543 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001544 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001545 }
1546
1547 if (ch == '-') {
1548 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001549 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 inShift = 1;
1551 }
1552 } else if (SPECIAL(ch,0,0)) {
1553 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001554 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001555 } else {
1556 *p++ = ch;
1557 }
1558 } else {
1559 charsleft = (charsleft << 6) | UB64(ch);
1560 bitsleft += 6;
1561 s++;
1562 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1563 }
1564 }
1565 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001566 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 s++;
1568 if (s < e && *s == '-') {
1569 s++;
1570 *p++ = '+';
1571 } else
1572 {
1573 inShift = 1;
1574 bitsleft = 0;
1575 }
1576 }
1577 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001578 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 errmsg = "unexpected special character";
1580 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001581 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582 }
1583 else {
1584 *p++ = ch;
1585 s++;
1586 }
1587 continue;
1588 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 outpos = p-PyUnicode_AS_UNICODE(unicode);
1590 endinpos = s-starts;
1591 if (unicode_decode_call_errorhandler(
1592 errors, &errorHandler,
1593 "utf7", errmsg,
1594 starts, size, &startinpos, &endinpos, &exc, &s,
1595 (PyObject **)&unicode, &outpos, &p))
1596 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 }
1598
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001599 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 outpos = p-PyUnicode_AS_UNICODE(unicode);
1601 endinpos = size;
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf7", "unterminated shift sequence",
1605 starts, size, &startinpos, &endinpos, &exc, &s,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608 if (s < e)
1609 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001611 if (consumed) {
1612 if(inShift)
1613 *consumed = startinpos;
1614 else
1615 *consumed = s-starts;
1616 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001618 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 goto onError;
1620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_XDECREF(errorHandler);
1622 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 return (PyObject *)unicode;
1624
1625onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001628 Py_DECREF(unicode);
1629 return NULL;
1630}
1631
1632
1633PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001634 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 int encodeSetO,
1636 int encodeWhiteSpace,
1637 const char *errors)
1638{
1639 PyObject *v;
1640 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001641 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001643 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001644 unsigned int bitsleft = 0;
1645 unsigned long charsleft = 0;
1646 char * out;
1647 char * start;
1648
1649 if (size == 0)
1650 return PyString_FromStringAndSize(NULL, 0);
1651
1652 v = PyString_FromStringAndSize(NULL, cbAllocated);
1653 if (v == NULL)
1654 return NULL;
1655
1656 start = out = PyString_AS_STRING(v);
1657 for (;i < size; ++i) {
1658 Py_UNICODE ch = s[i];
1659
1660 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001661 if (ch == '+') {
1662 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 *out++ = '-';
1664 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1665 charsleft = ch;
1666 bitsleft = 16;
1667 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001668 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001670 } else {
1671 *out++ = (char) ch;
1672 }
1673 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1675 *out++ = B64(charsleft << (6-bitsleft));
1676 charsleft = 0;
1677 bitsleft = 0;
1678 /* Characters not in the BASE64 set implicitly unshift the sequence
1679 so no '-' is required, except if the character is itself a '-' */
1680 if (B64CHAR(ch) || ch == '-') {
1681 *out++ = '-';
1682 }
1683 inShift = 0;
1684 *out++ = (char) ch;
1685 } else {
1686 bitsleft += 16;
1687 charsleft = (charsleft << 16) | ch;
1688 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1689
1690 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001691 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 or '-' then the shift sequence will be terminated implicitly and we
1693 don't have to insert a '-'. */
1694
1695 if (bitsleft == 0) {
1696 if (i + 1 < size) {
1697 Py_UNICODE ch2 = s[i+1];
1698
1699 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001700
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001701 } else if (B64CHAR(ch2) || ch2 == '-') {
1702 *out++ = '-';
1703 inShift = 0;
1704 } else {
1705 inShift = 0;
1706 }
1707
1708 }
1709 else {
1710 *out++ = '-';
1711 inShift = 0;
1712 }
1713 }
Tim Petersced69f82003-09-16 20:30:58 +00001714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001716 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717 if (bitsleft) {
1718 *out++= B64(charsleft << (6-bitsleft) );
1719 *out++ = '-';
1720 }
1721
Tim Peters5de98422002-04-27 18:44:32 +00001722 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 return v;
1724}
1725
/* End of the UTF-7 codec: undefine its helper macros so the names are
   free for the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1732
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733/* --- UTF-8 Codec -------------------------------------------------------- */
1734
/* Sequence length implied by each possible UTF-8 prefix (lead) byte; see
   RFC 2279 for details.  A zero entry marks an illegal prefix. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1756
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001758 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 const char *errors)
1760{
Walter Dörwald69652032004-09-07 20:24:22 +00001761 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1762}
1763
1764PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001766 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001767 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001769 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001771 Py_ssize_t startinpos;
1772 Py_ssize_t endinpos;
1773 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 const char *e;
1775 PyUnicodeObject *unicode;
1776 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 PyObject *errorHandler = NULL;
1779 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
1781 /* Note: size will always be longer than the resulting Unicode
1782 character count */
1783 unicode = _PyUnicode_New(size);
1784 if (!unicode)
1785 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001786 if (size == 0) {
1787 if (consumed)
1788 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 /* Unpack UTF-8 encoded data */
1793 p = unicode->str;
1794 e = s + size;
1795
1796 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001797 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
1799 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001800 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 s++;
1802 continue;
1803 }
1804
1805 n = utf8_code_length[ch];
1806
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001808 if (consumed)
1809 break;
1810 else {
1811 errmsg = "unexpected end of data";
1812 startinpos = s-starts;
1813 endinpos = size;
1814 goto utf8Error;
1815 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817
1818 switch (n) {
1819
1820 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 startinpos = s-starts;
1823 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825
1826 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 startinpos = s-starts;
1829 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831
1832 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 if ((s[1] & 0xc0) != 0x80) {
1834 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 goto utf8Error;
1838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 startinpos = s-starts;
1842 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 errmsg = "illegal encoding";
1844 goto utf8Error;
1845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 break;
1849
1850 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001851 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 (s[2] & 0xc0) != 0x80) {
1853 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 startinpos = s-starts;
1855 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 goto utf8Error;
1857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001859 if (ch < 0x0800) {
1860 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001861 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001862
1863 XXX For wide builds (UCS-4) we should probably try
1864 to recombine the surrogates into a single code
1865 unit.
1866 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 startinpos = s-starts;
1869 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 goto utf8Error;
1871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001873 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001874 break;
1875
1876 case 4:
1877 if ((s[1] & 0xc0) != 0x80 ||
1878 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001879 (s[3] & 0xc0) != 0x80) {
1880 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 startinpos = s-starts;
1882 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001883 goto utf8Error;
1884 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001885 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1886 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1887 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001888 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001889 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001890 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001891 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001892 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 startinpos = s-starts;
1895 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 goto utf8Error;
1897 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001898#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899 *p++ = (Py_UNICODE)ch;
1900#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001901 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001902
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 /* translate from 10000..10FFFF to 0..FFFF */
1904 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001905
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001906 /* high surrogate = top 10 bits added to D800 */
1907 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001908
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001909 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001910 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001911#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 break;
1913
1914 default:
1915 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 startinpos = s-starts;
1918 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 }
1921 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001922 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001923
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 outpos = p-PyUnicode_AS_UNICODE(unicode);
1926 if (unicode_decode_call_errorhandler(
1927 errors, &errorHandler,
1928 "utf8", errmsg,
1929 starts, size, &startinpos, &endinpos, &exc, &s,
1930 (PyObject **)&unicode, &outpos, &p))
1931 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 }
Walter Dörwald69652032004-09-07 20:24:22 +00001933 if (consumed)
1934 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935
1936 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001937 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 goto onError;
1939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 Py_XDECREF(errorHandler);
1941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 return (PyObject *)unicode;
1943
1944onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001945 Py_XDECREF(errorHandler);
1946 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 Py_DECREF(unicode);
1948 return NULL;
1949}
1950
Tim Peters602f7402002-04-27 18:03:26 +00001951/* Allocation strategy: if the string is short, convert into a stack buffer
1952 and allocate exactly as much space needed at the end. Else allocate the
1953 maximum possible needed (4 result bytes per Unicode character), and return
1954 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001955*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001956PyObject *
1957PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001958 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001959 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960{
Tim Peters602f7402002-04-27 18:03:26 +00001961#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001962
Martin v. Löwis18e16552006-02-15 17:27:45 +00001963 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001964 PyObject *v; /* result string object */
1965 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001966 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001967 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001968 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001969
Tim Peters602f7402002-04-27 18:03:26 +00001970 assert(s != NULL);
1971 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
Tim Peters602f7402002-04-27 18:03:26 +00001973 if (size <= MAX_SHORT_UNICHARS) {
1974 /* Write into the stack buffer; nallocated can't overflow.
1975 * At the end, we'll allocate exactly as much heap space as it
1976 * turns out we need.
1977 */
1978 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1979 v = NULL; /* will allocate after we're done */
1980 p = stackbuf;
1981 }
1982 else {
1983 /* Overallocate on the heap, and give the excess back at the end. */
1984 nallocated = size * 4;
1985 if (nallocated / 4 != size) /* overflow! */
1986 return PyErr_NoMemory();
1987 v = PyString_FromStringAndSize(NULL, nallocated);
1988 if (v == NULL)
1989 return NULL;
1990 p = PyString_AS_STRING(v);
1991 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001992
Tim Peters602f7402002-04-27 18:03:26 +00001993 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001995
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001996 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001997 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001999
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002001 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002002 *p++ = (char)(0xc0 | (ch >> 6));
2003 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002004 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002005 else {
Tim Peters602f7402002-04-27 18:03:26 +00002006 /* Encode UCS2 Unicode ordinals */
2007 if (ch < 0x10000) {
2008 /* Special case: check for high surrogate */
2009 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2010 Py_UCS4 ch2 = s[i];
2011 /* Check for low surrogate and combine the two to
2012 form a UCS4 value */
2013 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002014 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002015 i++;
2016 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 }
Tim Peters602f7402002-04-27 18:03:26 +00002018 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002019 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002020 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002021 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2022 *p++ = (char)(0x80 | (ch & 0x3f));
2023 continue;
2024 }
2025encodeUCS4:
2026 /* Encode UCS4 Unicode ordinals */
2027 *p++ = (char)(0xf0 | (ch >> 18));
2028 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2029 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2030 *p++ = (char)(0x80 | (ch & 0x3f));
2031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002033
Tim Peters602f7402002-04-27 18:03:26 +00002034 if (v == NULL) {
2035 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002036 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002037 assert(nneeded <= nallocated);
2038 v = PyString_FromStringAndSize(stackbuf, nneeded);
2039 }
2040 else {
2041 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002042 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002043 assert(nneeded <= nallocated);
2044 _PyString_Resize(&v, nneeded);
2045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002047
Tim Peters602f7402002-04-27 18:03:26 +00002048#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049}
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2052{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!PyUnicode_Check(unicode)) {
2054 PyErr_BadArgument();
2055 return NULL;
2056 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002057 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2058 PyUnicode_GET_SIZE(unicode),
2059 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060}
2061
Walter Dörwald6e390802007-08-17 16:41:28 +00002062/* --- UTF-32 Codec ------------------------------------------------------- */
2063
2064PyObject *
2065PyUnicode_DecodeUTF32(const char *s,
2066 Py_ssize_t size,
2067 const char *errors,
2068 int *byteorder)
2069{
2070 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2071}
2072
2073PyObject *
2074PyUnicode_DecodeUTF32Stateful(const char *s,
2075 Py_ssize_t size,
2076 const char *errors,
2077 int *byteorder,
2078 Py_ssize_t *consumed)
2079{
2080 const char *starts = s;
2081 Py_ssize_t startinpos;
2082 Py_ssize_t endinpos;
2083 Py_ssize_t outpos;
2084 PyUnicodeObject *unicode;
2085 Py_UNICODE *p;
2086#ifndef Py_UNICODE_WIDE
2087 int i, pairs;
2088#else
2089 const int pairs = 0;
2090#endif
2091 const unsigned char *q, *e;
2092 int bo = 0; /* assume native ordering by default */
2093 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002094 /* Offsets from q for retrieving bytes in the right order. */
2095#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2096 int iorder[] = {0, 1, 2, 3};
2097#else
2098 int iorder[] = {3, 2, 1, 0};
2099#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002100 PyObject *errorHandler = NULL;
2101 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002102 /* On narrow builds we split characters outside the BMP into two
2103 codepoints => count how much extra space we need. */
2104#ifndef Py_UNICODE_WIDE
2105 for (i = pairs = 0; i < size/4; i++)
2106 if (((Py_UCS4 *)s)[i] >= 0x10000)
2107 pairs++;
2108#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002109
2110 /* This might be one to much, because of a BOM */
2111 unicode = _PyUnicode_New((size+3)/4+pairs);
2112 if (!unicode)
2113 return NULL;
2114 if (size == 0)
2115 return (PyObject *)unicode;
2116
2117 /* Unpack UTF-32 encoded data */
2118 p = unicode->str;
2119 q = (unsigned char *)s;
2120 e = q + size;
2121
2122 if (byteorder)
2123 bo = *byteorder;
2124
2125 /* Check for BOM marks (U+FEFF) in the input and adjust current
2126 byte order setting accordingly. In native mode, the leading BOM
2127 mark is skipped, in all other modes, it is copied to the output
2128 stream as-is (giving a ZWNBSP character). */
2129 if (bo == 0) {
2130 if (size >= 4) {
2131 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2132 (q[iorder[1]] << 8) | q[iorder[0]];
2133#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2134 if (bom == 0x0000FEFF) {
2135 q += 4;
2136 bo = -1;
2137 }
2138 else if (bom == 0xFFFE0000) {
2139 q += 4;
2140 bo = 1;
2141 }
2142#else
2143 if (bom == 0x0000FEFF) {
2144 q += 4;
2145 bo = 1;
2146 }
2147 else if (bom == 0xFFFE0000) {
2148 q += 4;
2149 bo = -1;
2150 }
2151#endif
2152 }
2153 }
2154
2155 if (bo == -1) {
2156 /* force LE */
2157 iorder[0] = 0;
2158 iorder[1] = 1;
2159 iorder[2] = 2;
2160 iorder[3] = 3;
2161 }
2162 else if (bo == 1) {
2163 /* force BE */
2164 iorder[0] = 3;
2165 iorder[1] = 2;
2166 iorder[2] = 1;
2167 iorder[3] = 0;
2168 }
2169
2170 while (q < e) {
2171 Py_UCS4 ch;
2172 /* remaining bytes at the end? (size should be divisible by 4) */
2173 if (e-q<4) {
2174 if (consumed)
2175 break;
2176 errmsg = "truncated data";
2177 startinpos = ((const char *)q)-starts;
2178 endinpos = ((const char *)e)-starts;
2179 goto utf32Error;
2180 /* The remaining input chars are ignored if the callback
2181 chooses to skip the input */
2182 }
2183 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2184 (q[iorder[1]] << 8) | q[iorder[0]];
2185
2186 if (ch >= 0x110000)
2187 {
2188 errmsg = "codepoint not in range(0x110000)";
2189 startinpos = ((const char *)q)-starts;
2190 endinpos = startinpos+4;
2191 goto utf32Error;
2192 }
2193#ifndef Py_UNICODE_WIDE
2194 if (ch >= 0x10000)
2195 {
2196 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2197 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2198 }
2199 else
2200#endif
2201 *p++ = ch;
2202 q += 4;
2203 continue;
2204 utf32Error:
2205 outpos = p-PyUnicode_AS_UNICODE(unicode);
2206 if (unicode_decode_call_errorhandler(
2207 errors, &errorHandler,
2208 "utf32", errmsg,
2209 starts, size, &startinpos, &endinpos, &exc, &s,
2210 (PyObject **)&unicode, &outpos, &p))
2211 goto onError;
2212 }
2213
2214 if (byteorder)
2215 *byteorder = bo;
2216
2217 if (consumed)
2218 *consumed = (const char *)q-starts;
2219
2220 /* Adjust length */
2221 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2222 goto onError;
2223
2224 Py_XDECREF(errorHandler);
2225 Py_XDECREF(exc);
2226 return (PyObject *)unicode;
2227
2228onError:
2229 Py_DECREF(unicode);
2230 Py_XDECREF(errorHandler);
2231 Py_XDECREF(exc);
2232 return NULL;
2233}
2234
2235PyObject *
2236PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2237 Py_ssize_t size,
2238 const char *errors,
2239 int byteorder)
2240{
2241 PyObject *v;
2242 unsigned char *p;
2243#ifndef Py_UNICODE_WIDE
2244 int i, pairs;
2245#else
2246 const int pairs = 0;
2247#endif
2248 /* Offsets from p for storing byte pairs in the right order. */
2249#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2250 int iorder[] = {0, 1, 2, 3};
2251#else
2252 int iorder[] = {3, 2, 1, 0};
2253#endif
2254
2255#define STORECHAR(CH) \
2256 do { \
2257 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2258 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2259 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2260 p[iorder[0]] = (CH) & 0xff; \
2261 p += 4; \
2262 } while(0)
2263
2264 /* In narrow builds we can output surrogate pairs as one codepoint,
2265 so we need less space. */
2266#ifndef Py_UNICODE_WIDE
2267 for (i = pairs = 0; i < size-1; i++)
2268 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2269 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2270 pairs++;
2271#endif
2272 v = PyString_FromStringAndSize(NULL,
2273 4 * (size - pairs + (byteorder == 0)));
2274 if (v == NULL)
2275 return NULL;
2276
2277 p = (unsigned char *)PyString_AS_STRING(v);
2278 if (byteorder == 0)
2279 STORECHAR(0xFEFF);
2280 if (size == 0)
2281 return v;
2282
2283 if (byteorder == -1) {
2284 /* force LE */
2285 iorder[0] = 0;
2286 iorder[1] = 1;
2287 iorder[2] = 2;
2288 iorder[3] = 3;
2289 }
2290 else if (byteorder == 1) {
2291 /* force BE */
2292 iorder[0] = 3;
2293 iorder[1] = 2;
2294 iorder[2] = 1;
2295 iorder[3] = 0;
2296 }
2297
2298 while (size-- > 0) {
2299 Py_UCS4 ch = *s++;
2300#ifndef Py_UNICODE_WIDE
2301 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2302 Py_UCS4 ch2 = *s;
2303 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2304 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2305 s++;
2306 size--;
2307 }
2308 }
2309#endif
2310 STORECHAR(ch);
2311 }
2312 return v;
2313#undef STORECHAR
2314}
2315
2316PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2317{
2318 if (!PyUnicode_Check(unicode)) {
2319 PyErr_BadArgument();
2320 return NULL;
2321 }
2322 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2323 PyUnicode_GET_SIZE(unicode),
2324 NULL,
2325 0);
2326}
2327
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328/* --- UTF-16 Codec ------------------------------------------------------- */
2329
Tim Peters772747b2001-08-09 22:21:55 +00002330PyObject *
2331PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002332 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002333 const char *errors,
2334 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335{
Walter Dörwald69652032004-09-07 20:24:22 +00002336 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2337}
2338
2339PyObject *
2340PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002341 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002342 const char *errors,
2343 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002344 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002345{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002347 Py_ssize_t startinpos;
2348 Py_ssize_t endinpos;
2349 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 PyUnicodeObject *unicode;
2351 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002352 const unsigned char *q, *e;
2353 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002354 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002355 /* Offsets from q for retrieving byte pairs in the right order. */
2356#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2357 int ihi = 1, ilo = 0;
2358#else
2359 int ihi = 0, ilo = 1;
2360#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 PyObject *errorHandler = NULL;
2362 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363
2364 /* Note: size will always be longer than the resulting Unicode
2365 character count */
2366 unicode = _PyUnicode_New(size);
2367 if (!unicode)
2368 return NULL;
2369 if (size == 0)
2370 return (PyObject *)unicode;
2371
2372 /* Unpack UTF-16 encoded data */
2373 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002374 q = (unsigned char *)s;
2375 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376
2377 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002378 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002380 /* Check for BOM marks (U+FEFF) in the input and adjust current
2381 byte order setting accordingly. In native mode, the leading BOM
2382 mark is skipped, in all other modes, it is copied to the output
2383 stream as-is (giving a ZWNBSP character). */
2384 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002385 if (size >= 2) {
2386 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002387#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002388 if (bom == 0xFEFF) {
2389 q += 2;
2390 bo = -1;
2391 }
2392 else if (bom == 0xFFFE) {
2393 q += 2;
2394 bo = 1;
2395 }
Tim Petersced69f82003-09-16 20:30:58 +00002396#else
Walter Dörwald69652032004-09-07 20:24:22 +00002397 if (bom == 0xFEFF) {
2398 q += 2;
2399 bo = 1;
2400 }
2401 else if (bom == 0xFFFE) {
2402 q += 2;
2403 bo = -1;
2404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002405#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002406 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408
Tim Peters772747b2001-08-09 22:21:55 +00002409 if (bo == -1) {
2410 /* force LE */
2411 ihi = 1;
2412 ilo = 0;
2413 }
2414 else if (bo == 1) {
2415 /* force BE */
2416 ihi = 0;
2417 ilo = 1;
2418 }
2419
2420 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002421 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002422 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002424 if (consumed)
2425 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002426 errmsg = "truncated data";
2427 startinpos = ((const char *)q)-starts;
2428 endinpos = ((const char *)e)-starts;
2429 goto utf16Error;
2430 /* The remaining input chars are ignored if the callback
2431 chooses to skip the input */
2432 }
2433 ch = (q[ihi] << 8) | q[ilo];
2434
Tim Peters772747b2001-08-09 22:21:55 +00002435 q += 2;
2436
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437 if (ch < 0xD800 || ch > 0xDFFF) {
2438 *p++ = ch;
2439 continue;
2440 }
2441
2442 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002443 if (q >= e) {
2444 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002445 startinpos = (((const char *)q)-2)-starts;
2446 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002447 goto utf16Error;
2448 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002449 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002450 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2451 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002452 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002453#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002454 *p++ = ch;
2455 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002456#else
2457 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002458#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002459 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 }
2461 else {
2462 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 startinpos = (((const char *)q)-4)-starts;
2464 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002465 goto utf16Error;
2466 }
2467
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002469 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 startinpos = (((const char *)q)-2)-starts;
2471 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002472 /* Fall through to report the error */
2473
2474 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 outpos = p-PyUnicode_AS_UNICODE(unicode);
2476 if (unicode_decode_call_errorhandler(
2477 errors, &errorHandler,
2478 "utf16", errmsg,
2479 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2480 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
2483
2484 if (byteorder)
2485 *byteorder = bo;
2486
Walter Dörwald69652032004-09-07 20:24:22 +00002487 if (consumed)
2488 *consumed = (const char *)q-starts;
2489
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002491 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 goto onError;
2493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return (PyObject *)unicode;
2497
2498onError:
2499 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 Py_XDECREF(errorHandler);
2501 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 return NULL;
2503}
2504
Tim Peters772747b2001-08-09 22:21:55 +00002505PyObject *
2506PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002507 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002508 const char *errors,
2509 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510{
2511 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002512 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002513#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002514 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002515#else
2516 const int pairs = 0;
2517#endif
Tim Peters772747b2001-08-09 22:21:55 +00002518 /* Offsets from p for storing byte pairs in the right order. */
2519#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2520 int ihi = 1, ilo = 0;
2521#else
2522 int ihi = 0, ilo = 1;
2523#endif
2524
2525#define STORECHAR(CH) \
2526 do { \
2527 p[ihi] = ((CH) >> 8) & 0xff; \
2528 p[ilo] = (CH) & 0xff; \
2529 p += 2; \
2530 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002532#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002533 for (i = pairs = 0; i < size; i++)
2534 if (s[i] >= 0x10000)
2535 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002536#endif
Tim Petersced69f82003-09-16 20:30:58 +00002537 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002538 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 if (v == NULL)
2540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
Tim Peters772747b2001-08-09 22:21:55 +00002542 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002544 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002545 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002546 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002547
2548 if (byteorder == -1) {
2549 /* force LE */
2550 ihi = 1;
2551 ilo = 0;
2552 }
2553 else if (byteorder == 1) {
2554 /* force BE */
2555 ihi = 0;
2556 ilo = 1;
2557 }
2558
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559 while (size-- > 0) {
2560 Py_UNICODE ch = *s++;
2561 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002562#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002564 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2565 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002567#endif
Tim Peters772747b2001-08-09 22:21:55 +00002568 STORECHAR(ch);
2569 if (ch2)
2570 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002573#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574}
2575
2576PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2577{
2578 if (!PyUnicode_Check(unicode)) {
2579 PyErr_BadArgument();
2580 return NULL;
2581 }
2582 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2583 PyUnicode_GET_SIZE(unicode),
2584 NULL,
2585 0);
2586}
2587
2588/* --- Unicode Escape Codec ----------------------------------------------- */
2589
Fredrik Lundh06d12682001-01-24 07:59:11 +00002590static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002591
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002593 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 const char *errors)
2595{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002597 Py_ssize_t startinpos;
2598 Py_ssize_t endinpos;
2599 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002604 char* message;
2605 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 PyObject *errorHandler = NULL;
2607 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002608
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 /* Escaped strings will always be longer than the resulting
2610 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 length after conversion to the true value.
2612 (but if the error callback returns a long replacement string
2613 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 v = _PyUnicode_New(size);
2615 if (v == NULL)
2616 goto onError;
2617 if (size == 0)
2618 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 while (s < end) {
2624 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002625 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627
2628 /* Non-escape characters are interpreted as Unicode ordinals */
2629 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002630 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 continue;
2632 }
2633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 /* \ - Escapes */
2636 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002637 c = *s++;
2638 if (s > end)
2639 c = '\0'; /* Invalid after \ */
2640 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641
2642 /* \x escapes */
2643 case '\n': break;
2644 case '\\': *p++ = '\\'; break;
2645 case '\'': *p++ = '\''; break;
2646 case '\"': *p++ = '\"'; break;
2647 case 'b': *p++ = '\b'; break;
2648 case 'f': *p++ = '\014'; break; /* FF */
2649 case 't': *p++ = '\t'; break;
2650 case 'n': *p++ = '\n'; break;
2651 case 'r': *p++ = '\r'; break;
2652 case 'v': *p++ = '\013'; break; /* VT */
2653 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2654
2655 /* \OOO (octal) escapes */
2656 case '0': case '1': case '2': case '3':
2657 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002658 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002659 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002660 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002661 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002662 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002664 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 break;
2666
Fredrik Lundhccc74732001-02-18 22:13:49 +00002667 /* hex escapes */
2668 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 digits = 2;
2671 message = "truncated \\xXX escape";
2672 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 digits = 4;
2677 message = "truncated \\uXXXX escape";
2678 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
Fredrik Lundhccc74732001-02-18 22:13:49 +00002680 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002681 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 digits = 8;
2683 message = "truncated \\UXXXXXXXX escape";
2684 hexescape:
2685 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002686 outpos = p-PyUnicode_AS_UNICODE(v);
2687 if (s+digits>end) {
2688 endinpos = size;
2689 if (unicode_decode_call_errorhandler(
2690 errors, &errorHandler,
2691 "unicodeescape", "end of string in escape sequence",
2692 starts, size, &startinpos, &endinpos, &exc, &s,
2693 (PyObject **)&v, &outpos, &p))
2694 goto onError;
2695 goto nextByte;
2696 }
2697 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002698 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002699 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 endinpos = (s+i+1)-starts;
2701 if (unicode_decode_call_errorhandler(
2702 errors, &errorHandler,
2703 "unicodeescape", message,
2704 starts, size, &startinpos, &endinpos, &exc, &s,
2705 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002706 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002708 }
2709 chr = (chr<<4) & ~0xF;
2710 if (c >= '0' && c <= '9')
2711 chr += c - '0';
2712 else if (c >= 'a' && c <= 'f')
2713 chr += 10 + c - 'a';
2714 else
2715 chr += 10 + c - 'A';
2716 }
2717 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002718 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 /* _decoding_error will have already written into the
2720 target buffer. */
2721 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002722 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002723 /* when we get here, chr is a 32-bit unicode character */
2724 if (chr <= 0xffff)
2725 /* UCS-2 character */
2726 *p++ = (Py_UNICODE) chr;
2727 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002728 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002729 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002730#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002731 *p++ = chr;
2732#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002733 chr -= 0x10000L;
2734 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002735 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002736#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002737 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 endinpos = s-starts;
2739 outpos = p-PyUnicode_AS_UNICODE(v);
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "unicodeescape", "illegal Unicode character",
2743 starts, size, &startinpos, &endinpos, &exc, &s,
2744 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002745 goto onError;
2746 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 break;
2748
2749 /* \N{name} */
2750 case 'N':
2751 message = "malformed \\N character escape";
2752 if (ucnhash_CAPI == NULL) {
2753 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002754 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002755 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002756 if (m == NULL)
2757 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002758 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002760 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002762 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002763 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764 if (ucnhash_CAPI == NULL)
2765 goto ucnhashError;
2766 }
2767 if (*s == '{') {
2768 const char *start = s+1;
2769 /* look for the closing brace */
2770 while (*s != '}' && s < end)
2771 s++;
2772 if (s > start && s < end && *s == '}') {
2773 /* found a name. look it up in the unicode database */
2774 message = "unknown Unicode character name";
2775 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002776 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 goto store;
2778 }
2779 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002780 endinpos = s-starts;
2781 outpos = p-PyUnicode_AS_UNICODE(v);
2782 if (unicode_decode_call_errorhandler(
2783 errors, &errorHandler,
2784 "unicodeescape", message,
2785 starts, size, &startinpos, &endinpos, &exc, &s,
2786 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002787 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788 break;
2789
2790 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002791 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 message = "\\ at end of string";
2793 s--;
2794 endinpos = s-starts;
2795 outpos = p-PyUnicode_AS_UNICODE(v);
2796 if (unicode_decode_call_errorhandler(
2797 errors, &errorHandler,
2798 "unicodeescape", message,
2799 starts, size, &startinpos, &endinpos, &exc, &s,
2800 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002801 goto onError;
2802 }
2803 else {
2804 *p++ = '\\';
2805 *p++ = (unsigned char)s[-1];
2806 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002807 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 nextByte:
2810 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002812 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002817
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002819 PyErr_SetString(
2820 PyExc_UnicodeError,
2821 "\\N escapes not supported (can't load unicodedata module)"
2822 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002823 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 Py_XDECREF(errorHandler);
2825 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002826 return NULL;
2827
Fredrik Lundhccc74732001-02-18 22:13:49 +00002828onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 Py_XDECREF(errorHandler);
2831 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 return NULL;
2833}
2834
/* unicodeescape_string(), defined after the findchar() helper below,
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2841
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002842Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002843 Py_ssize_t size,
2844 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002845{
2846 /* like wcschr, but doesn't stop at NULL characters */
2847
2848 while (size-- > 0) {
2849 if (*s == ch)
2850 return s;
2851 s++;
2852 }
2853
2854 return NULL;
2855}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002856
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857static
2858PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002859 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 int quotes)
2861{
2862 PyObject *repr;
2863 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002865 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866
Neal Norwitz17753ec2006-08-21 22:21:19 +00002867 /* XXX(nnorwitz): rather than over-allocating, it would be
2868 better to choose a different scheme. Perhaps scan the
2869 first N-chars of the string and allocate based on that size.
2870 */
2871 /* Initial allocation is based on the longest-possible unichr
2872 escape.
2873
2874 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2875 unichr, so in this case it's the longest unichr escape. In
2876 narrow (UTF-16) builds this is five chars per source unichr
2877 since there are two unichrs in the surrogate pair, so in narrow
2878 (UTF-16) builds it's not the longest unichr escape.
2879
2880 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2881 so in the narrow (UTF-16) build case it's the longest unichr
2882 escape.
2883 */
2884
2885 repr = PyString_FromStringAndSize(NULL,
2886 2
2887#ifdef Py_UNICODE_WIDE
2888 + 10*size
2889#else
2890 + 6*size
2891#endif
2892 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 if (repr == NULL)
2894 return NULL;
2895
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002896 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897
2898 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002900 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 !findchar(s, size, '"')) ? '"' : '\'';
2902 }
2903 while (size-- > 0) {
2904 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002905
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002906 /* Escape quotes and backslashes */
2907 if ((quotes &&
2908 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 *p++ = '\\';
2910 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002911 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002912 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002913
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002914#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002915 /* Map 21-bit characters to '\U00xxxxxx' */
2916 else if (ch >= 0x10000) {
2917 *p++ = '\\';
2918 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002919 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2920 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2921 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2922 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2923 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2924 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2925 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002926 *p++ = hexdigit[ch & 0x0000000F];
2927 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002928 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002929#else
2930 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002931 else if (ch >= 0xD800 && ch < 0xDC00) {
2932 Py_UNICODE ch2;
2933 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002934
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002935 ch2 = *s++;
2936 size--;
2937 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2938 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2939 *p++ = '\\';
2940 *p++ = 'U';
2941 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2942 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2943 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2944 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2945 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2946 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2947 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2948 *p++ = hexdigit[ucs & 0x0000000F];
2949 continue;
2950 }
2951 /* Fall through: isolated surrogates are copied as-is */
2952 s--;
2953 size++;
2954 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002955#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002956
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002958 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 *p++ = '\\';
2960 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002961 *p++ = hexdigit[(ch >> 12) & 0x000F];
2962 *p++ = hexdigit[(ch >> 8) & 0x000F];
2963 *p++ = hexdigit[(ch >> 4) & 0x000F];
2964 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002966
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002967 /* Map special whitespace to '\t', \n', '\r' */
2968 else if (ch == '\t') {
2969 *p++ = '\\';
2970 *p++ = 't';
2971 }
2972 else if (ch == '\n') {
2973 *p++ = '\\';
2974 *p++ = 'n';
2975 }
2976 else if (ch == '\r') {
2977 *p++ = '\\';
2978 *p++ = 'r';
2979 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002980
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002981 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002982 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002984 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002985 *p++ = hexdigit[(ch >> 4) & 0x000F];
2986 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 /* Copy everything else as-is */
2990 else
2991 *p++ = (char) ch;
2992 }
2993 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002994 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995
2996 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002997 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 return repr;
2999}
3000
3001PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003002 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003{
3004 return unicodeescape_string(s, size, 0);
3005}
3006
3007PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3008{
3009 if (!PyUnicode_Check(unicode)) {
3010 PyErr_BadArgument();
3011 return NULL;
3012 }
3013 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3014 PyUnicode_GET_SIZE(unicode));
3015}
3016
3017/* --- Raw Unicode Escape Codec ------------------------------------------- */
3018
3019PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003020 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 const char *errors)
3022{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003024 Py_ssize_t startinpos;
3025 Py_ssize_t endinpos;
3026 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 const char *end;
3030 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 PyObject *errorHandler = NULL;
3032 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003033
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 /* Escaped strings will always be longer than the resulting
3035 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 length after conversion to the true value. (But decoding error
3037 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 v = _PyUnicode_New(size);
3039 if (v == NULL)
3040 goto onError;
3041 if (size == 0)
3042 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 end = s + size;
3045 while (s < end) {
3046 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003047 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003049 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050
3051 /* Non-escape characters are interpreted as Unicode ordinals */
3052 if (*s != '\\') {
3053 *p++ = (unsigned char)*s++;
3054 continue;
3055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057
3058 /* \u-escapes are only interpreted iff the number of leading
3059 backslashes if odd */
3060 bs = s;
3061 for (;s < end;) {
3062 if (*s != '\\')
3063 break;
3064 *p++ = (unsigned char)*s++;
3065 }
3066 if (((s - bs) & 1) == 0 ||
3067 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003068 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 continue;
3070 }
3071 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003072 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 s++;
3074
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003075 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003077 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 endinpos = s-starts;
3081 if (unicode_decode_call_errorhandler(
3082 errors, &errorHandler,
3083 "rawunicodeescape", "truncated \\uXXXX",
3084 starts, size, &startinpos, &endinpos, &exc, &s,
3085 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 }
3089 x = (x<<4) & ~0xF;
3090 if (c >= '0' && c <= '9')
3091 x += c - '0';
3092 else if (c >= 'a' && c <= 'f')
3093 x += 10 + c - 'a';
3094 else
3095 x += 10 + c - 'A';
3096 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003097 if (x <= 0xffff)
3098 /* UCS-2 character */
3099 *p++ = (Py_UNICODE) x;
3100 else if (x <= 0x10ffff) {
3101 /* UCS-4 character. Either store directly, or as
3102 surrogate pair. */
3103#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003104 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003105#else
3106 x -= 0x10000L;
3107 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3108 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3109#endif
3110 } else {
3111 endinpos = s-starts;
3112 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003113 if (unicode_decode_call_errorhandler(
3114 errors, &errorHandler,
3115 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3116 starts, size, &startinpos, &endinpos, &exc, &s,
3117 (PyObject **)&v, &outpos, &p))
3118 goto onError;
3119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 nextByte:
3121 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003123 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003124 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 Py_XDECREF(errorHandler);
3126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003128
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 onError:
3130 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 Py_XDECREF(errorHandler);
3132 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 return NULL;
3134}
3135
3136PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003137 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138{
3139 PyObject *repr;
3140 char *p;
3141 char *q;
3142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003143 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003145#ifdef Py_UNICODE_WIDE
3146 repr = PyString_FromStringAndSize(NULL, 10 * size);
3147#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003149#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 if (repr == NULL)
3151 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003152 if (size == 0)
3153 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154
3155 p = q = PyString_AS_STRING(repr);
3156 while (size-- > 0) {
3157 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#ifdef Py_UNICODE_WIDE
3159 /* Map 32-bit characters to '\Uxxxxxxxx' */
3160 if (ch >= 0x10000) {
3161 *p++ = '\\';
3162 *p++ = 'U';
3163 *p++ = hexdigit[(ch >> 28) & 0xf];
3164 *p++ = hexdigit[(ch >> 24) & 0xf];
3165 *p++ = hexdigit[(ch >> 20) & 0xf];
3166 *p++ = hexdigit[(ch >> 16) & 0xf];
3167 *p++ = hexdigit[(ch >> 12) & 0xf];
3168 *p++ = hexdigit[(ch >> 8) & 0xf];
3169 *p++ = hexdigit[(ch >> 4) & 0xf];
3170 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003171 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003172 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003173#else
3174 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3175 if (ch >= 0xD800 && ch < 0xDC00) {
3176 Py_UNICODE ch2;
3177 Py_UCS4 ucs;
3178
3179 ch2 = *s++;
3180 size--;
3181 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3182 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3183 *p++ = '\\';
3184 *p++ = 'U';
3185 *p++ = hexdigit[(ucs >> 28) & 0xf];
3186 *p++ = hexdigit[(ucs >> 24) & 0xf];
3187 *p++ = hexdigit[(ucs >> 20) & 0xf];
3188 *p++ = hexdigit[(ucs >> 16) & 0xf];
3189 *p++ = hexdigit[(ucs >> 12) & 0xf];
3190 *p++ = hexdigit[(ucs >> 8) & 0xf];
3191 *p++ = hexdigit[(ucs >> 4) & 0xf];
3192 *p++ = hexdigit[ucs & 0xf];
3193 continue;
3194 }
3195 /* Fall through: isolated surrogates are copied as-is */
3196 s--;
3197 size++;
3198 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003199#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 /* Map 16-bit characters to '\uxxxx' */
3201 if (ch >= 256) {
3202 *p++ = '\\';
3203 *p++ = 'u';
3204 *p++ = hexdigit[(ch >> 12) & 0xf];
3205 *p++ = hexdigit[(ch >> 8) & 0xf];
3206 *p++ = hexdigit[(ch >> 4) & 0xf];
3207 *p++ = hexdigit[ch & 15];
3208 }
3209 /* Copy everything else as-is */
3210 else
3211 *p++ = (char) ch;
3212 }
3213 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003214 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 return repr;
3216}
3217
3218PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3219{
3220 if (!PyUnicode_Check(unicode)) {
3221 PyErr_BadArgument();
3222 return NULL;
3223 }
3224 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3225 PyUnicode_GET_SIZE(unicode));
3226}
3227
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003228/* --- Unicode Internal Codec ------------------------------------------- */
3229
3230PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003231 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 const char *errors)
3233{
3234 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003235 Py_ssize_t startinpos;
3236 Py_ssize_t endinpos;
3237 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003238 PyUnicodeObject *v;
3239 Py_UNICODE *p;
3240 const char *end;
3241 const char *reason;
3242 PyObject *errorHandler = NULL;
3243 PyObject *exc = NULL;
3244
Neal Norwitzd43069c2006-01-08 01:12:10 +00003245#ifdef Py_UNICODE_WIDE
3246 Py_UNICODE unimax = PyUnicode_GetMax();
3247#endif
3248
Armin Rigo7ccbca92006-10-04 12:17:45 +00003249 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003250 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3251 if (v == NULL)
3252 goto onError;
3253 if (PyUnicode_GetSize((PyObject *)v) == 0)
3254 return (PyObject *)v;
3255 p = PyUnicode_AS_UNICODE(v);
3256 end = s + size;
3257
3258 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003259 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003260 /* We have to sanity check the raw data, otherwise doom looms for
3261 some malformed UCS-4 data. */
3262 if (
3263 #ifdef Py_UNICODE_WIDE
3264 *p > unimax || *p < 0 ||
3265 #endif
3266 end-s < Py_UNICODE_SIZE
3267 )
3268 {
3269 startinpos = s - starts;
3270 if (end-s < Py_UNICODE_SIZE) {
3271 endinpos = end-starts;
3272 reason = "truncated input";
3273 }
3274 else {
3275 endinpos = s - starts + Py_UNICODE_SIZE;
3276 reason = "illegal code point (> 0x10FFFF)";
3277 }
3278 outpos = p - PyUnicode_AS_UNICODE(v);
3279 if (unicode_decode_call_errorhandler(
3280 errors, &errorHandler,
3281 "unicode_internal", reason,
3282 starts, size, &startinpos, &endinpos, &exc, &s,
3283 (PyObject **)&v, &outpos, &p)) {
3284 goto onError;
3285 }
3286 }
3287 else {
3288 p++;
3289 s += Py_UNICODE_SIZE;
3290 }
3291 }
3292
Martin v. Löwis412fb672006-04-13 06:34:32 +00003293 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003294 goto onError;
3295 Py_XDECREF(errorHandler);
3296 Py_XDECREF(exc);
3297 return (PyObject *)v;
3298
3299 onError:
3300 Py_XDECREF(v);
3301 Py_XDECREF(errorHandler);
3302 Py_XDECREF(exc);
3303 return NULL;
3304}
3305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306/* --- Latin-1 Codec ------------------------------------------------------ */
3307
3308PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003309 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 const char *errors)
3311{
3312 PyUnicodeObject *v;
3313 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003314
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003316 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003317 Py_UNICODE r = *(unsigned char*)s;
3318 return PyUnicode_FromUnicode(&r, 1);
3319 }
3320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 v = _PyUnicode_New(size);
3322 if (v == NULL)
3323 goto onError;
3324 if (size == 0)
3325 return (PyObject *)v;
3326 p = PyUnicode_AS_UNICODE(v);
3327 while (size-- > 0)
3328 *p++ = (unsigned char)*s++;
3329 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003330
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 onError:
3332 Py_XDECREF(v);
3333 return NULL;
3334}
3335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336/* create or adjust a UnicodeEncodeError */
3337static void make_encode_exception(PyObject **exceptionObject,
3338 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003339 const Py_UNICODE *unicode, Py_ssize_t size,
3340 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 if (*exceptionObject == NULL) {
3344 *exceptionObject = PyUnicodeEncodeError_Create(
3345 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 }
3347 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3349 goto onError;
3350 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3351 goto onError;
3352 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3353 goto onError;
3354 return;
3355 onError:
3356 Py_DECREF(*exceptionObject);
3357 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 }
3359}
3360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361/* raises a UnicodeEncodeError */
3362static void raise_encode_exception(PyObject **exceptionObject,
3363 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003364 const Py_UNICODE *unicode, Py_ssize_t size,
3365 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 const char *reason)
3367{
3368 make_encode_exception(exceptionObject,
3369 encoding, unicode, size, startpos, endpos, reason);
3370 if (*exceptionObject != NULL)
3371 PyCodec_StrictErrors(*exceptionObject);
3372}
3373
3374/* error handling callback helper:
3375 build arguments, call the callback and check the arguments,
3376 put the result into newpos and return the replacement string, which
3377 has to be freed by the caller */
3378static PyObject *unicode_encode_call_errorhandler(const char *errors,
3379 PyObject **errorHandler,
3380 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3382 Py_ssize_t startpos, Py_ssize_t endpos,
3383 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003385 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386
3387 PyObject *restuple;
3388 PyObject *resunicode;
3389
3390 if (*errorHandler == NULL) {
3391 *errorHandler = PyCodec_LookupError(errors);
3392 if (*errorHandler == NULL)
3393 return NULL;
3394 }
3395
3396 make_encode_exception(exceptionObject,
3397 encoding, unicode, size, startpos, endpos, reason);
3398 if (*exceptionObject == NULL)
3399 return NULL;
3400
3401 restuple = PyObject_CallFunctionObjArgs(
3402 *errorHandler, *exceptionObject, NULL);
3403 if (restuple == NULL)
3404 return NULL;
3405 if (!PyTuple_Check(restuple)) {
3406 PyErr_Format(PyExc_TypeError, &argparse[4]);
3407 Py_DECREF(restuple);
3408 return NULL;
3409 }
3410 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3411 &resunicode, newpos)) {
3412 Py_DECREF(restuple);
3413 return NULL;
3414 }
3415 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003416 *newpos = size+*newpos;
3417 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003419 Py_DECREF(restuple);
3420 return NULL;
3421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 Py_INCREF(resunicode);
3423 Py_DECREF(restuple);
3424 return resunicode;
3425}
3426
3427static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 const char *errors,
3430 int limit)
3431{
3432 /* output object */
3433 PyObject *res;
3434 /* pointers to the beginning and end+1 of input */
3435 const Py_UNICODE *startp = p;
3436 const Py_UNICODE *endp = p + size;
3437 /* pointer to the beginning of the unencodable characters */
3438 /* const Py_UNICODE *badp = NULL; */
3439 /* pointer into the output */
3440 char *str;
3441 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003442 Py_ssize_t respos = 0;
3443 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003444 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3445 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 PyObject *errorHandler = NULL;
3447 PyObject *exc = NULL;
3448 /* the following variable is used for caching string comparisons
3449 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3450 int known_errorHandler = -1;
3451
3452 /* allocate enough for a simple encoding without
3453 replacements, if we need more, we'll resize */
3454 res = PyString_FromStringAndSize(NULL, size);
3455 if (res == NULL)
3456 goto onError;
3457 if (size == 0)
3458 return res;
3459 str = PyString_AS_STRING(res);
3460 ressize = size;
3461
3462 while (p<endp) {
3463 Py_UNICODE c = *p;
3464
3465 /* can we encode this? */
3466 if (c<limit) {
3467 /* no overflow check, because we know that the space is enough */
3468 *str++ = (char)c;
3469 ++p;
3470 }
3471 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003472 Py_ssize_t unicodepos = p-startp;
3473 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003475 Py_ssize_t repsize;
3476 Py_ssize_t newpos;
3477 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 Py_UNICODE *uni2;
3479 /* startpos for collecting unencodable chars */
3480 const Py_UNICODE *collstart = p;
3481 const Py_UNICODE *collend = p;
3482 /* find all unecodable characters */
3483 while ((collend < endp) && ((*collend)>=limit))
3484 ++collend;
3485 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3486 if (known_errorHandler==-1) {
3487 if ((errors==NULL) || (!strcmp(errors, "strict")))
3488 known_errorHandler = 1;
3489 else if (!strcmp(errors, "replace"))
3490 known_errorHandler = 2;
3491 else if (!strcmp(errors, "ignore"))
3492 known_errorHandler = 3;
3493 else if (!strcmp(errors, "xmlcharrefreplace"))
3494 known_errorHandler = 4;
3495 else
3496 known_errorHandler = 0;
3497 }
3498 switch (known_errorHandler) {
3499 case 1: /* strict */
3500 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3501 goto onError;
3502 case 2: /* replace */
3503 while (collstart++<collend)
3504 *str++ = '?'; /* fall through */
3505 case 3: /* ignore */
3506 p = collend;
3507 break;
3508 case 4: /* xmlcharrefreplace */
3509 respos = str-PyString_AS_STRING(res);
3510 /* determine replacement size (temporarily (mis)uses p) */
3511 for (p = collstart, repsize = 0; p < collend; ++p) {
3512 if (*p<10)
3513 repsize += 2+1+1;
3514 else if (*p<100)
3515 repsize += 2+2+1;
3516 else if (*p<1000)
3517 repsize += 2+3+1;
3518 else if (*p<10000)
3519 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003520#ifndef Py_UNICODE_WIDE
3521 else
3522 repsize += 2+5+1;
3523#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524 else if (*p<100000)
3525 repsize += 2+5+1;
3526 else if (*p<1000000)
3527 repsize += 2+6+1;
3528 else
3529 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003530#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
3532 requiredsize = respos+repsize+(endp-collend);
3533 if (requiredsize > ressize) {
3534 if (requiredsize<2*ressize)
3535 requiredsize = 2*ressize;
3536 if (_PyString_Resize(&res, requiredsize))
3537 goto onError;
3538 str = PyString_AS_STRING(res) + respos;
3539 ressize = requiredsize;
3540 }
3541 /* generate replacement (temporarily (mis)uses p) */
3542 for (p = collstart; p < collend; ++p) {
3543 str += sprintf(str, "&#%d;", (int)*p);
3544 }
3545 p = collend;
3546 break;
3547 default:
3548 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3549 encoding, reason, startp, size, &exc,
3550 collstart-startp, collend-startp, &newpos);
3551 if (repunicode == NULL)
3552 goto onError;
3553 /* need more space? (at least enough for what we
3554 have+the replacement+the rest of the string, so
3555 we won't have to check space for encodable characters) */
3556 respos = str-PyString_AS_STRING(res);
3557 repsize = PyUnicode_GET_SIZE(repunicode);
3558 requiredsize = respos+repsize+(endp-collend);
3559 if (requiredsize > ressize) {
3560 if (requiredsize<2*ressize)
3561 requiredsize = 2*ressize;
3562 if (_PyString_Resize(&res, requiredsize)) {
3563 Py_DECREF(repunicode);
3564 goto onError;
3565 }
3566 str = PyString_AS_STRING(res) + respos;
3567 ressize = requiredsize;
3568 }
3569 /* check if there is anything unencodable in the replacement
3570 and copy it to the output */
3571 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3572 c = *uni2;
3573 if (c >= limit) {
3574 raise_encode_exception(&exc, encoding, startp, size,
3575 unicodepos, unicodepos+1, reason);
3576 Py_DECREF(repunicode);
3577 goto onError;
3578 }
3579 *str = (char)c;
3580 }
3581 p = startp + newpos;
3582 Py_DECREF(repunicode);
3583 }
3584 }
3585 }
3586 /* Resize if we allocated to much */
3587 respos = str-PyString_AS_STRING(res);
3588 if (respos<ressize)
3589 /* If this falls res will be NULL */
3590 _PyString_Resize(&res, respos);
3591 Py_XDECREF(errorHandler);
3592 Py_XDECREF(exc);
3593 return res;
3594
3595 onError:
3596 Py_XDECREF(res);
3597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
3599 return NULL;
3600}
3601
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003603 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 const char *errors)
3605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607}
3608
3609PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3610{
3611 if (!PyUnicode_Check(unicode)) {
3612 PyErr_BadArgument();
3613 return NULL;
3614 }
3615 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3616 PyUnicode_GET_SIZE(unicode),
3617 NULL);
3618}
3619
3620/* --- 7-bit ASCII Codec -------------------------------------------------- */
3621
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 const char *errors)
3625{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 PyUnicodeObject *v;
3628 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003629 Py_ssize_t startinpos;
3630 Py_ssize_t endinpos;
3631 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 const char *e;
3633 PyObject *errorHandler = NULL;
3634 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003637 if (size == 1 && *(unsigned char*)s < 128) {
3638 Py_UNICODE r = *(unsigned char*)s;
3639 return PyUnicode_FromUnicode(&r, 1);
3640 }
Tim Petersced69f82003-09-16 20:30:58 +00003641
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 v = _PyUnicode_New(size);
3643 if (v == NULL)
3644 goto onError;
3645 if (size == 0)
3646 return (PyObject *)v;
3647 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 e = s + size;
3649 while (s < e) {
3650 register unsigned char c = (unsigned char)*s;
3651 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 ++s;
3654 }
3655 else {
3656 startinpos = s-starts;
3657 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003658 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 if (unicode_decode_call_errorhandler(
3660 errors, &errorHandler,
3661 "ascii", "ordinal not in range(128)",
3662 starts, size, &startinpos, &endinpos, &exc, &s,
3663 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003667 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003668 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003669 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 Py_XDECREF(errorHandler);
3671 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003673
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 onError:
3675 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 Py_XDECREF(errorHandler);
3677 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return NULL;
3679}
3680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003682 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 const char *errors)
3684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686}
3687
3688PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3689{
3690 if (!PyUnicode_Check(unicode)) {
3691 PyErr_BadArgument();
3692 return NULL;
3693 }
3694 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3695 PyUnicode_GET_SIZE(unicode),
3696 NULL);
3697}
3698
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003699#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003700
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003701/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003702
Martin v. Löwisd8251432006-06-14 05:21:04 +00003703#if SIZEOF_INT < SIZEOF_SSIZE_T
3704#define NEED_RETRY
3705#endif
3706
3707/* XXX This code is limited to "true" double-byte encodings, as
3708 a) it assumes an incomplete character consists of a single byte, and
3709 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3710 encodings, see IsDBCSLeadByteEx documentation. */
3711
/* Return 1 if the byte at S[OFFSET] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() reports no earlier position, when the byte at the previous
   position is not itself a lead byte, or when the previous position is
   exactly two bytes back. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3722
3723/*
3724 * Decode MBCS string into unicode object. If 'final' is set, converts
3725 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3726 */
3727static int decode_mbcs(PyUnicodeObject **v,
3728 const char *s, /* MBCS string */
3729 int size, /* sizeof MBCS string */
3730 int final)
3731{
3732 Py_UNICODE *p;
3733 Py_ssize_t n = 0;
3734 int usize = 0;
3735
3736 assert(size >= 0);
3737
3738 /* Skip trailing lead-byte unless 'final' is set */
3739 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3740 --size;
3741
3742 /* First get the size of the result */
3743 if (size > 0) {
3744 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3745 if (usize == 0) {
3746 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3747 return -1;
3748 }
3749 }
3750
3751 if (*v == NULL) {
3752 /* Create unicode object */
3753 *v = _PyUnicode_New(usize);
3754 if (*v == NULL)
3755 return -1;
3756 }
3757 else {
3758 /* Extend unicode object */
3759 n = PyUnicode_GET_SIZE(*v);
3760 if (_PyUnicode_Resize(v, n + usize) < 0)
3761 return -1;
3762 }
3763
3764 /* Do the conversion */
3765 if (size > 0) {
3766 p = PyUnicode_AS_UNICODE(*v) + n;
3767 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3768 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3769 return -1;
3770 }
3771 }
3772
3773 return size;
3774}
3775
3776PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3777 Py_ssize_t size,
3778 const char *errors,
3779 Py_ssize_t *consumed)
3780{
3781 PyUnicodeObject *v = NULL;
3782 int done;
3783
3784 if (consumed)
3785 *consumed = 0;
3786
3787#ifdef NEED_RETRY
3788 retry:
3789 if (size > INT_MAX)
3790 done = decode_mbcs(&v, s, INT_MAX, 0);
3791 else
3792#endif
3793 done = decode_mbcs(&v, s, (int)size, !consumed);
3794
3795 if (done < 0) {
3796 Py_XDECREF(v);
3797 return NULL;
3798 }
3799
3800 if (consumed)
3801 *consumed += done;
3802
3803#ifdef NEED_RETRY
3804 if (size > INT_MAX) {
3805 s += done;
3806 size -= done;
3807 goto retry;
3808 }
3809#endif
3810
3811 return (PyObject *)v;
3812}
3813
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003814PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003816 const char *errors)
3817{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003818 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3819}
3820
3821/*
3822 * Convert unicode into string object (MBCS).
3823 * Returns 0 if succeed, -1 otherwise.
3824 */
3825static int encode_mbcs(PyObject **repr,
3826 const Py_UNICODE *p, /* unicode */
3827 int size) /* size of unicode */
3828{
3829 int mbcssize = 0;
3830 Py_ssize_t n = 0;
3831
3832 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003833
3834 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003835 if (size > 0) {
3836 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3837 if (mbcssize == 0) {
3838 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3839 return -1;
3840 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003841 }
3842
Martin v. Löwisd8251432006-06-14 05:21:04 +00003843 if (*repr == NULL) {
3844 /* Create string object */
3845 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3846 if (*repr == NULL)
3847 return -1;
3848 }
3849 else {
3850 /* Extend string object */
3851 n = PyString_Size(*repr);
3852 if (_PyString_Resize(repr, n + mbcssize) < 0)
3853 return -1;
3854 }
3855
3856 /* Do the conversion */
3857 if (size > 0) {
3858 char *s = PyString_AS_STRING(*repr) + n;
3859 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3860 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3861 return -1;
3862 }
3863 }
3864
3865 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003866}
3867
3868PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003869 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003870 const char *errors)
3871{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003872 PyObject *repr = NULL;
3873 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003874
Martin v. Löwisd8251432006-06-14 05:21:04 +00003875#ifdef NEED_RETRY
3876 retry:
3877 if (size > INT_MAX)
3878 ret = encode_mbcs(&repr, p, INT_MAX);
3879 else
3880#endif
3881 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003882
Martin v. Löwisd8251432006-06-14 05:21:04 +00003883 if (ret < 0) {
3884 Py_XDECREF(repr);
3885 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003886 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003887
3888#ifdef NEED_RETRY
3889 if (size > INT_MAX) {
3890 p += INT_MAX;
3891 size -= INT_MAX;
3892 goto retry;
3893 }
3894#endif
3895
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003896 return repr;
3897}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003898
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003899PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3900{
3901 if (!PyUnicode_Check(unicode)) {
3902 PyErr_BadArgument();
3903 return NULL;
3904 }
3905 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3906 PyUnicode_GET_SIZE(unicode),
3907 NULL);
3908}
3909
Martin v. Löwisd8251432006-06-14 05:21:04 +00003910#undef NEED_RETRY
3911
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003912#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003913
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914/* --- Character Mapping Codec -------------------------------------------- */
3915
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003917 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 PyObject *mapping,
3919 const char *errors)
3920{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t startinpos;
3923 Py_ssize_t endinpos;
3924 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 PyUnicodeObject *v;
3927 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003928 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 PyObject *errorHandler = NULL;
3930 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003931 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003933
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 /* Default to Latin-1 */
3935 if (mapping == NULL)
3936 return PyUnicode_DecodeLatin1(s, size, errors);
3937
3938 v = _PyUnicode_New(size);
3939 if (v == NULL)
3940 goto onError;
3941 if (size == 0)
3942 return (PyObject *)v;
3943 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003945 if (PyUnicode_CheckExact(mapping)) {
3946 mapstring = PyUnicode_AS_UNICODE(mapping);
3947 maplen = PyUnicode_GET_SIZE(mapping);
3948 while (s < e) {
3949 unsigned char ch = *s;
3950 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003952 if (ch < maplen)
3953 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003955 if (x == 0xfffe) {
3956 /* undefined mapping */
3957 outpos = p-PyUnicode_AS_UNICODE(v);
3958 startinpos = s-starts;
3959 endinpos = startinpos+1;
3960 if (unicode_decode_call_errorhandler(
3961 errors, &errorHandler,
3962 "charmap", "character maps to <undefined>",
3963 starts, size, &startinpos, &endinpos, &exc, &s,
3964 (PyObject **)&v, &outpos, &p)) {
3965 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003966 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003967 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003968 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003969 *p++ = x;
3970 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003972 }
3973 else {
3974 while (s < e) {
3975 unsigned char ch = *s;
3976 PyObject *w, *x;
3977
3978 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3979 w = PyInt_FromLong((long)ch);
3980 if (w == NULL)
3981 goto onError;
3982 x = PyObject_GetItem(mapping, w);
3983 Py_DECREF(w);
3984 if (x == NULL) {
3985 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3986 /* No mapping found means: mapping is undefined. */
3987 PyErr_Clear();
3988 x = Py_None;
3989 Py_INCREF(x);
3990 } else
3991 goto onError;
3992 }
3993
3994 /* Apply mapping */
3995 if (PyInt_Check(x)) {
3996 long value = PyInt_AS_LONG(x);
3997 if (value < 0 || value > 65535) {
3998 PyErr_SetString(PyExc_TypeError,
3999 "character mapping must be in range(65536)");
4000 Py_DECREF(x);
4001 goto onError;
4002 }
4003 *p++ = (Py_UNICODE)value;
4004 }
4005 else if (x == Py_None) {
4006 /* undefined mapping */
4007 outpos = p-PyUnicode_AS_UNICODE(v);
4008 startinpos = s-starts;
4009 endinpos = startinpos+1;
4010 if (unicode_decode_call_errorhandler(
4011 errors, &errorHandler,
4012 "charmap", "character maps to <undefined>",
4013 starts, size, &startinpos, &endinpos, &exc, &s,
4014 (PyObject **)&v, &outpos, &p)) {
4015 Py_DECREF(x);
4016 goto onError;
4017 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004018 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004019 continue;
4020 }
4021 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004023
4024 if (targetsize == 1)
4025 /* 1-1 mapping */
4026 *p++ = *PyUnicode_AS_UNICODE(x);
4027
4028 else if (targetsize > 1) {
4029 /* 1-n mapping */
4030 if (targetsize > extrachars) {
4031 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004032 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4033 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004034 (targetsize << 2);
4035 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004036 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004037 if (_PyUnicode_Resize(&v,
4038 PyUnicode_GET_SIZE(v) + needed) < 0) {
4039 Py_DECREF(x);
4040 goto onError;
4041 }
4042 p = PyUnicode_AS_UNICODE(v) + oldpos;
4043 }
4044 Py_UNICODE_COPY(p,
4045 PyUnicode_AS_UNICODE(x),
4046 targetsize);
4047 p += targetsize;
4048 extrachars -= targetsize;
4049 }
4050 /* 1-0 mapping: skip the character */
4051 }
4052 else {
4053 /* wrong return value */
4054 PyErr_SetString(PyExc_TypeError,
4055 "character mapping must return integer, None or unicode");
4056 Py_DECREF(x);
4057 goto onError;
4058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004060 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 }
4063 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004064 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 Py_XDECREF(errorHandler);
4067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 Py_XDECREF(errorHandler);
4072 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 Py_XDECREF(v);
4074 return NULL;
4075}
4076
Martin v. Löwis3f767792006-06-04 19:36:28 +00004077/* Charmap encoding: the lookup table */
4078
4079struct encoding_map{
4080 PyObject_HEAD
4081 unsigned char level1[32];
4082 int count2, count3;
4083 unsigned char level23[1];
4084};
4085
4086static PyObject*
4087encoding_map_size(PyObject *obj, PyObject* args)
4088{
4089 struct encoding_map *map = (struct encoding_map*)obj;
4090 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4091 128*map->count3);
4092}
4093
4094static PyMethodDef encoding_map_methods[] = {
4095 {"size", encoding_map_size, METH_NOARGS,
4096 PyDoc_STR("Return the size (in bytes) of this object") },
4097 { 0 }
4098};
4099
4100static void
4101encoding_map_dealloc(PyObject* o)
4102{
4103 PyObject_FREE(o);
4104}
4105
4106static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004107 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004108 "EncodingMap", /*tp_name*/
4109 sizeof(struct encoding_map), /*tp_basicsize*/
4110 0, /*tp_itemsize*/
4111 /* methods */
4112 encoding_map_dealloc, /*tp_dealloc*/
4113 0, /*tp_print*/
4114 0, /*tp_getattr*/
4115 0, /*tp_setattr*/
4116 0, /*tp_compare*/
4117 0, /*tp_repr*/
4118 0, /*tp_as_number*/
4119 0, /*tp_as_sequence*/
4120 0, /*tp_as_mapping*/
4121 0, /*tp_hash*/
4122 0, /*tp_call*/
4123 0, /*tp_str*/
4124 0, /*tp_getattro*/
4125 0, /*tp_setattro*/
4126 0, /*tp_as_buffer*/
4127 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4128 0, /*tp_doc*/
4129 0, /*tp_traverse*/
4130 0, /*tp_clear*/
4131 0, /*tp_richcompare*/
4132 0, /*tp_weaklistoffset*/
4133 0, /*tp_iter*/
4134 0, /*tp_iternext*/
4135 encoding_map_methods, /*tp_methods*/
4136 0, /*tp_members*/
4137 0, /*tp_getset*/
4138 0, /*tp_base*/
4139 0, /*tp_dict*/
4140 0, /*tp_descr_get*/
4141 0, /*tp_descr_set*/
4142 0, /*tp_dictoffset*/
4143 0, /*tp_init*/
4144 0, /*tp_alloc*/
4145 0, /*tp_new*/
4146 0, /*tp_free*/
4147 0, /*tp_is_gc*/
4148};
4149
4150PyObject*
4151PyUnicode_BuildEncodingMap(PyObject* string)
4152{
4153 Py_UNICODE *decode;
4154 PyObject *result;
4155 struct encoding_map *mresult;
4156 int i;
4157 int need_dict = 0;
4158 unsigned char level1[32];
4159 unsigned char level2[512];
4160 unsigned char *mlevel1, *mlevel2, *mlevel3;
4161 int count2 = 0, count3 = 0;
4162
4163 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4164 PyErr_BadArgument();
4165 return NULL;
4166 }
4167 decode = PyUnicode_AS_UNICODE(string);
4168 memset(level1, 0xFF, sizeof level1);
4169 memset(level2, 0xFF, sizeof level2);
4170
4171 /* If there isn't a one-to-one mapping of NULL to \0,
4172 or if there are non-BMP characters, we need to use
4173 a mapping dictionary. */
4174 if (decode[0] != 0)
4175 need_dict = 1;
4176 for (i = 1; i < 256; i++) {
4177 int l1, l2;
4178 if (decode[i] == 0
4179 #ifdef Py_UNICODE_WIDE
4180 || decode[i] > 0xFFFF
4181 #endif
4182 ) {
4183 need_dict = 1;
4184 break;
4185 }
4186 if (decode[i] == 0xFFFE)
4187 /* unmapped character */
4188 continue;
4189 l1 = decode[i] >> 11;
4190 l2 = decode[i] >> 7;
4191 if (level1[l1] == 0xFF)
4192 level1[l1] = count2++;
4193 if (level2[l2] == 0xFF)
4194 level2[l2] = count3++;
4195 }
4196
4197 if (count2 >= 0xFF || count3 >= 0xFF)
4198 need_dict = 1;
4199
4200 if (need_dict) {
4201 PyObject *result = PyDict_New();
4202 PyObject *key, *value;
4203 if (!result)
4204 return NULL;
4205 for (i = 0; i < 256; i++) {
4206 key = value = NULL;
4207 key = PyInt_FromLong(decode[i]);
4208 value = PyInt_FromLong(i);
4209 if (!key || !value)
4210 goto failed1;
4211 if (PyDict_SetItem(result, key, value) == -1)
4212 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004213 Py_DECREF(key);
4214 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004215 }
4216 return result;
4217 failed1:
4218 Py_XDECREF(key);
4219 Py_XDECREF(value);
4220 Py_DECREF(result);
4221 return NULL;
4222 }
4223
4224 /* Create a three-level trie */
4225 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4226 16*count2 + 128*count3 - 1);
4227 if (!result)
4228 return PyErr_NoMemory();
4229 PyObject_Init(result, &EncodingMapType);
4230 mresult = (struct encoding_map*)result;
4231 mresult->count2 = count2;
4232 mresult->count3 = count3;
4233 mlevel1 = mresult->level1;
4234 mlevel2 = mresult->level23;
4235 mlevel3 = mresult->level23 + 16*count2;
4236 memcpy(mlevel1, level1, 32);
4237 memset(mlevel2, 0xFF, 16*count2);
4238 memset(mlevel3, 0, 128*count3);
4239 count3 = 0;
4240 for (i = 1; i < 256; i++) {
4241 int o1, o2, o3, i2, i3;
4242 if (decode[i] == 0xFFFE)
4243 /* unmapped character */
4244 continue;
4245 o1 = decode[i]>>11;
4246 o2 = (decode[i]>>7) & 0xF;
4247 i2 = 16*mlevel1[o1] + o2;
4248 if (mlevel2[i2] == 0xFF)
4249 mlevel2[i2] = count3++;
4250 o3 = decode[i] & 0x7F;
4251 i3 = 128*mlevel2[i2] + o3;
4252 mlevel3[i3] = i;
4253 }
4254 return result;
4255}
4256
4257static int
4258encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4259{
4260 struct encoding_map *map = (struct encoding_map*)mapping;
4261 int l1 = c>>11;
4262 int l2 = (c>>7) & 0xF;
4263 int l3 = c & 0x7F;
4264 int i;
4265
4266#ifdef Py_UNICODE_WIDE
4267 if (c > 0xFFFF) {
4268 return -1;
4269 }
4270#endif
4271 if (c == 0)
4272 return 0;
4273 /* level 1*/
4274 i = map->level1[l1];
4275 if (i == 0xFF) {
4276 return -1;
4277 }
4278 /* level 2*/
4279 i = map->level23[16*i+l2];
4280 if (i == 0xFF) {
4281 return -1;
4282 }
4283 /* level 3 */
4284 i = map->level23[16*map->count2 + 128*i + l3];
4285 if (i == 0) {
4286 return -1;
4287 }
4288 return i;
4289}
4290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291/* Lookup the character ch in the mapping. If the character
4292 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004293 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 PyObject *w = PyInt_FromLong((long)c);
4297 PyObject *x;
4298
4299 if (w == NULL)
4300 return NULL;
4301 x = PyObject_GetItem(mapping, w);
4302 Py_DECREF(w);
4303 if (x == NULL) {
4304 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4305 /* No mapping found means: mapping is undefined. */
4306 PyErr_Clear();
4307 x = Py_None;
4308 Py_INCREF(x);
4309 return x;
4310 } else
4311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004313 else if (x == Py_None)
4314 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 else if (PyInt_Check(x)) {
4316 long value = PyInt_AS_LONG(x);
4317 if (value < 0 || value > 255) {
4318 PyErr_SetString(PyExc_TypeError,
4319 "character mapping must be in range(256)");
4320 Py_DECREF(x);
4321 return NULL;
4322 }
4323 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325 else if (PyString_Check(x))
4326 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 /* wrong return value */
4329 PyErr_SetString(PyExc_TypeError,
4330 "character mapping must return integer, None or str");
4331 Py_DECREF(x);
4332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 }
4334}
4335
Martin v. Löwis3f767792006-06-04 19:36:28 +00004336static int
4337charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4338{
4339 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4340 /* exponentially overallocate to minimize reallocations */
4341 if (requiredsize < 2*outsize)
4342 requiredsize = 2*outsize;
4343 if (_PyString_Resize(outobj, requiredsize)) {
4344 return 0;
4345 }
4346 return 1;
4347}
4348
/* Outcome of a single charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or growing the output failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352/* lookup the character, put the result in the output string and adjust
4353 various state variables. Reallocate the output string if not enough
4354 space is available. Return a new reference to the object that
4355 was put in the output buffer, or Py_None, if the mapping was undefined
4356 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004357 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004359charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004360 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004362 PyObject *rep;
4363 char *outstart;
4364 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365
Christian Heimese93237d2007-12-19 02:37:44 +00004366 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004367 int res = encoding_map_lookup(c, mapping);
4368 Py_ssize_t requiredsize = *outpos+1;
4369 if (res == -1)
4370 return enc_FAILED;
4371 if (outsize<requiredsize)
4372 if (!charmapencode_resize(outobj, outpos, requiredsize))
4373 return enc_EXCEPTION;
4374 outstart = PyString_AS_STRING(*outobj);
4375 outstart[(*outpos)++] = (char)res;
4376 return enc_SUCCESS;
4377 }
4378
4379 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004381 return enc_EXCEPTION;
4382 else if (rep==Py_None) {
4383 Py_DECREF(rep);
4384 return enc_FAILED;
4385 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004387 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004388 if (outsize<requiredsize)
4389 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004391 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004393 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4395 }
4396 else {
4397 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4399 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004400 if (outsize<requiredsize)
4401 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004403 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 memcpy(outstart + *outpos, repchars, repsize);
4407 *outpos += repsize;
4408 }
4409 }
Georg Brandl9f167602006-06-04 21:46:16 +00004410 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004411 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412}
4413
4414/* handle an error in PyUnicode_EncodeCharmap
4415 Return 0 on success, -1 on error */
4416static
4417int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004420 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004421 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422{
4423 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t repsize;
4425 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 Py_UNICODE *uni2;
4427 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 Py_ssize_t collstartpos = *inpos;
4429 Py_ssize_t collendpos = *inpos+1;
4430 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 char *encoding = "charmap";
4432 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 /* find all unencodable characters */
4436 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004437 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004438 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004439 int res = encoding_map_lookup(p[collendpos], mapping);
4440 if (res != -1)
4441 break;
4442 ++collendpos;
4443 continue;
4444 }
4445
4446 rep = charmapencode_lookup(p[collendpos], mapping);
4447 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004449 else if (rep!=Py_None) {
4450 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 break;
4452 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004453 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 ++collendpos;
4455 }
4456 /* cache callback name lookup
4457 * (if not done yet, i.e. it's the first error) */
4458 if (*known_errorHandler==-1) {
4459 if ((errors==NULL) || (!strcmp(errors, "strict")))
4460 *known_errorHandler = 1;
4461 else if (!strcmp(errors, "replace"))
4462 *known_errorHandler = 2;
4463 else if (!strcmp(errors, "ignore"))
4464 *known_errorHandler = 3;
4465 else if (!strcmp(errors, "xmlcharrefreplace"))
4466 *known_errorHandler = 4;
4467 else
4468 *known_errorHandler = 0;
4469 }
4470 switch (*known_errorHandler) {
4471 case 1: /* strict */
4472 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4473 return -1;
4474 case 2: /* replace */
4475 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4476 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004477 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 return -1;
4479 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004480 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4482 return -1;
4483 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 }
4485 /* fall through */
4486 case 3: /* ignore */
4487 *inpos = collendpos;
4488 break;
4489 case 4: /* xmlcharrefreplace */
4490 /* generate replacement (temporarily (mis)uses p) */
4491 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4492 char buffer[2+29+1+1];
4493 char *cp;
4494 sprintf(buffer, "&#%d;", (int)p[collpos]);
4495 for (cp = buffer; *cp; ++cp) {
4496 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004497 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004499 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4501 return -1;
4502 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 }
4504 }
4505 *inpos = collendpos;
4506 break;
4507 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004508 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 encoding, reason, p, size, exceptionObject,
4510 collstartpos, collendpos, &newpos);
4511 if (repunicode == NULL)
4512 return -1;
4513 /* generate replacement */
4514 repsize = PyUnicode_GET_SIZE(repunicode);
4515 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4516 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004517 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 return -1;
4519 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4523 return -1;
4524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 }
4526 *inpos = newpos;
4527 Py_DECREF(repunicode);
4528 }
4529 return 0;
4530}
4531
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 PyObject *mapping,
4535 const char *errors)
4536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 /* output object */
4538 PyObject *res = NULL;
4539 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 PyObject *errorHandler = NULL;
4544 PyObject *exc = NULL;
4545 /* the following variable is used for caching string comparisons
4546 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4547 * 3=ignore, 4=xmlcharrefreplace */
4548 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549
4550 /* Default to Latin-1 */
4551 if (mapping == NULL)
4552 return PyUnicode_EncodeLatin1(p, size, errors);
4553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 /* allocate enough for a simple encoding without
4555 replacements, if we need more, we'll resize */
4556 res = PyString_FromStringAndSize(NULL, size);
4557 if (res == NULL)
4558 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004559 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 while (inpos<size) {
4563 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004564 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4565 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004567 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 if (charmap_encoding_error(p, size, &inpos, mapping,
4569 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004570 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004571 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004572 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 else
4576 /* done with this character => adjust input position */
4577 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 /* Resize if we allocated to much */
4581 if (respos<PyString_GET_SIZE(res)) {
4582 if (_PyString_Resize(&res, respos))
4583 goto onError;
4584 }
4585 Py_XDECREF(exc);
4586 Py_XDECREF(errorHandler);
4587 return res;
4588
4589 onError:
4590 Py_XDECREF(res);
4591 Py_XDECREF(exc);
4592 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 return NULL;
4594}
4595
4596PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4597 PyObject *mapping)
4598{
4599 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4600 PyErr_BadArgument();
4601 return NULL;
4602 }
4603 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4604 PyUnicode_GET_SIZE(unicode),
4605 mapping,
4606 NULL);
4607}
4608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609/* create or adjust a UnicodeTranslateError */
4610static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004611 const Py_UNICODE *unicode, Py_ssize_t size,
4612 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 if (*exceptionObject == NULL) {
4616 *exceptionObject = PyUnicodeTranslateError_Create(
4617 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618 }
4619 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4621 goto onError;
4622 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4623 goto onError;
4624 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4625 goto onError;
4626 return;
4627 onError:
4628 Py_DECREF(*exceptionObject);
4629 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 }
4631}
4632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633/* raises a UnicodeTranslateError */
4634static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 const Py_UNICODE *unicode, Py_ssize_t size,
4636 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 const char *reason)
4638{
4639 make_translate_exception(exceptionObject,
4640 unicode, size, startpos, endpos, reason);
4641 if (*exceptionObject != NULL)
4642 PyCodec_StrictErrors(*exceptionObject);
4643}
4644
4645/* error handling callback helper:
4646 build arguments, call the callback and check the arguments,
4647 put the result into newpos and return the replacement string, which
4648 has to be freed by the caller */
4649static PyObject *unicode_translate_call_errorhandler(const char *errors,
4650 PyObject **errorHandler,
4651 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4653 Py_ssize_t startpos, Py_ssize_t endpos,
4654 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004656 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657
Martin v. Löwis412fb672006-04-13 06:34:32 +00004658 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 PyObject *restuple;
4660 PyObject *resunicode;
4661
4662 if (*errorHandler == NULL) {
4663 *errorHandler = PyCodec_LookupError(errors);
4664 if (*errorHandler == NULL)
4665 return NULL;
4666 }
4667
4668 make_translate_exception(exceptionObject,
4669 unicode, size, startpos, endpos, reason);
4670 if (*exceptionObject == NULL)
4671 return NULL;
4672
4673 restuple = PyObject_CallFunctionObjArgs(
4674 *errorHandler, *exceptionObject, NULL);
4675 if (restuple == NULL)
4676 return NULL;
4677 if (!PyTuple_Check(restuple)) {
4678 PyErr_Format(PyExc_TypeError, &argparse[4]);
4679 Py_DECREF(restuple);
4680 return NULL;
4681 }
4682 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 Py_DECREF(restuple);
4685 return NULL;
4686 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 if (i_newpos<0)
4688 *newpos = size+i_newpos;
4689 else
4690 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004691 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004692 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004693 Py_DECREF(restuple);
4694 return NULL;
4695 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 Py_INCREF(resunicode);
4697 Py_DECREF(restuple);
4698 return resunicode;
4699}
4700
4701/* Lookup the character ch in the mapping and put the result in result,
4702 which must be decrefed by the caller.
4703 Return 0 on success, -1 on error */
4704static
4705int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4706{
4707 PyObject *w = PyInt_FromLong((long)c);
4708 PyObject *x;
4709
4710 if (w == NULL)
4711 return -1;
4712 x = PyObject_GetItem(mapping, w);
4713 Py_DECREF(w);
4714 if (x == NULL) {
4715 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4716 /* No mapping found means: use 1:1 mapping. */
4717 PyErr_Clear();
4718 *result = NULL;
4719 return 0;
4720 } else
4721 return -1;
4722 }
4723 else if (x == Py_None) {
4724 *result = x;
4725 return 0;
4726 }
4727 else if (PyInt_Check(x)) {
4728 long value = PyInt_AS_LONG(x);
4729 long max = PyUnicode_GetMax();
4730 if (value < 0 || value > max) {
4731 PyErr_Format(PyExc_TypeError,
4732 "character mapping must be in range(0x%lx)", max+1);
4733 Py_DECREF(x);
4734 return -1;
4735 }
4736 *result = x;
4737 return 0;
4738 }
4739 else if (PyUnicode_Check(x)) {
4740 *result = x;
4741 return 0;
4742 }
4743 else {
4744 /* wrong return value */
4745 PyErr_SetString(PyExc_TypeError,
4746 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004747 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 return -1;
4749 }
4750}
4751/* ensure that *outobj is at least requiredsize characters long,
4752if not reallocate and adjust various state variables.
4753Return 0 on success, -1 on error */
4754static
Walter Dörwald4894c302003-10-24 14:25:28 +00004755int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004759 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004763 if (requiredsize < 2 * oldsize)
4764 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004765 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 return -1;
4767 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 }
4769 return 0;
4770}
4771/* lookup the character, put the result in the output string and adjust
4772 various state variables. Return a new reference to the object that
4773 was put in the output buffer in *result, or Py_None, if the mapping was
4774 undefined (in which case no character was written).
4775 The called must decref result.
4776 Return 0 on success, -1 on error. */
4777static
Walter Dörwald4894c302003-10-24 14:25:28 +00004778int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004780 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781{
Walter Dörwald4894c302003-10-24 14:25:28 +00004782 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004783 return -1;
4784 if (*res==NULL) {
4785 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004786 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 }
4788 else if (*res==Py_None)
4789 ;
4790 else if (PyInt_Check(*res)) {
4791 /* no overflow check, because we know that the space is enough */
4792 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4793 }
4794 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004795 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 if (repsize==1) {
4797 /* no overflow check, because we know that the space is enough */
4798 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4799 }
4800 else if (repsize!=0) {
4801 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004803 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004804 repsize - 1;
4805 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 return -1;
4807 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4808 *outp += repsize;
4809 }
4810 }
4811 else
4812 return -1;
4813 return 0;
4814}
4815
4816PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 PyObject *mapping,
4819 const char *errors)
4820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 /* output object */
4822 PyObject *res = NULL;
4823 /* pointers to the beginning and end+1 of input */
4824 const Py_UNICODE *startp = p;
4825 const Py_UNICODE *endp = p + size;
4826 /* pointer into the output */
4827 Py_UNICODE *str;
4828 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004829 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 char *reason = "character maps to <undefined>";
4831 PyObject *errorHandler = NULL;
4832 PyObject *exc = NULL;
4833 /* the following variable is used for caching string comparisons
4834 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4835 * 3=ignore, 4=xmlcharrefreplace */
4836 int known_errorHandler = -1;
4837
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 if (mapping == NULL) {
4839 PyErr_BadArgument();
4840 return NULL;
4841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842
4843 /* allocate enough for a simple 1:1 translation without
4844 replacements, if we need more, we'll resize */
4845 res = PyUnicode_FromUnicode(NULL, size);
4846 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004847 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 return res;
4850 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 while (p<endp) {
4853 /* try to encode it */
4854 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004855 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 goto onError;
4858 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004859 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 if (x!=Py_None) /* it worked => adjust input pointer */
4861 ++p;
4862 else { /* untranslatable character */
4863 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t repsize;
4865 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 Py_UNICODE *uni2;
4867 /* startpos for collecting untranslatable chars */
4868 const Py_UNICODE *collstart = p;
4869 const Py_UNICODE *collend = p+1;
4870 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 /* find all untranslatable characters */
4873 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004874 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 goto onError;
4876 Py_XDECREF(x);
4877 if (x!=Py_None)
4878 break;
4879 ++collend;
4880 }
4881 /* cache callback name lookup
4882 * (if not done yet, i.e. it's the first error) */
4883 if (known_errorHandler==-1) {
4884 if ((errors==NULL) || (!strcmp(errors, "strict")))
4885 known_errorHandler = 1;
4886 else if (!strcmp(errors, "replace"))
4887 known_errorHandler = 2;
4888 else if (!strcmp(errors, "ignore"))
4889 known_errorHandler = 3;
4890 else if (!strcmp(errors, "xmlcharrefreplace"))
4891 known_errorHandler = 4;
4892 else
4893 known_errorHandler = 0;
4894 }
4895 switch (known_errorHandler) {
4896 case 1: /* strict */
4897 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4898 goto onError;
4899 case 2: /* replace */
4900 /* No need to check for space, this is a 1:1 replacement */
4901 for (coll = collstart; coll<collend; ++coll)
4902 *str++ = '?';
4903 /* fall through */
4904 case 3: /* ignore */
4905 p = collend;
4906 break;
4907 case 4: /* xmlcharrefreplace */
4908 /* generate replacement (temporarily (mis)uses p) */
4909 for (p = collstart; p < collend; ++p) {
4910 char buffer[2+29+1+1];
4911 char *cp;
4912 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004913 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4915 goto onError;
4916 for (cp = buffer; *cp; ++cp)
4917 *str++ = *cp;
4918 }
4919 p = collend;
4920 break;
4921 default:
4922 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4923 reason, startp, size, &exc,
4924 collstart-startp, collend-startp, &newpos);
4925 if (repunicode == NULL)
4926 goto onError;
4927 /* generate replacement */
4928 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004929 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4931 Py_DECREF(repunicode);
4932 goto onError;
4933 }
4934 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4935 *str++ = *uni2;
4936 p = startp + newpos;
4937 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 }
4939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 /* Resize if we allocated to much */
4942 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004943 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004944 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004945 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 }
4947 Py_XDECREF(exc);
4948 Py_XDECREF(errorHandler);
4949 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 onError:
4952 Py_XDECREF(res);
4953 Py_XDECREF(exc);
4954 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 return NULL;
4956}
4957
4958PyObject *PyUnicode_Translate(PyObject *str,
4959 PyObject *mapping,
4960 const char *errors)
4961{
4962 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 str = PyUnicode_FromObject(str);
4965 if (str == NULL)
4966 goto onError;
4967 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4968 PyUnicode_GET_SIZE(str),
4969 mapping,
4970 errors);
4971 Py_DECREF(str);
4972 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004973
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 onError:
4975 Py_XDECREF(str);
4976 return NULL;
4977}
Tim Petersced69f82003-09-16 20:30:58 +00004978
Guido van Rossum9e896b32000-04-05 20:11:21 +00004979/* --- Decimal Encoder ---------------------------------------------------- */
4980
4981int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004982 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004983 char *output,
4984 const char *errors)
4985{
4986 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 PyObject *errorHandler = NULL;
4988 PyObject *exc = NULL;
4989 const char *encoding = "decimal";
4990 const char *reason = "invalid decimal Unicode string";
4991 /* the following variable is used for caching string comparisons
4992 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4993 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004994
4995 if (output == NULL) {
4996 PyErr_BadArgument();
4997 return -1;
4998 }
4999
5000 p = s;
5001 end = s + length;
5002 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005004 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t repsize;
5007 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 Py_UNICODE *uni2;
5009 Py_UNICODE *collstart;
5010 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005011
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 if (Py_UNICODE_ISSPACE(ch)) {
5013 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005015 continue;
5016 }
5017 decimal = Py_UNICODE_TODECIMAL(ch);
5018 if (decimal >= 0) {
5019 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005021 continue;
5022 }
Guido van Rossumba477042000-04-06 18:18:10 +00005023 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005024 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005026 continue;
5027 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 /* All other characters are considered unencodable */
5029 collstart = p;
5030 collend = p+1;
5031 while (collend < end) {
5032 if ((0 < *collend && *collend < 256) ||
5033 !Py_UNICODE_ISSPACE(*collend) ||
5034 Py_UNICODE_TODECIMAL(*collend))
5035 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 /* cache callback name lookup
5038 * (if not done yet, i.e. it's the first error) */
5039 if (known_errorHandler==-1) {
5040 if ((errors==NULL) || (!strcmp(errors, "strict")))
5041 known_errorHandler = 1;
5042 else if (!strcmp(errors, "replace"))
5043 known_errorHandler = 2;
5044 else if (!strcmp(errors, "ignore"))
5045 known_errorHandler = 3;
5046 else if (!strcmp(errors, "xmlcharrefreplace"))
5047 known_errorHandler = 4;
5048 else
5049 known_errorHandler = 0;
5050 }
5051 switch (known_errorHandler) {
5052 case 1: /* strict */
5053 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5054 goto onError;
5055 case 2: /* replace */
5056 for (p = collstart; p < collend; ++p)
5057 *output++ = '?';
5058 /* fall through */
5059 case 3: /* ignore */
5060 p = collend;
5061 break;
5062 case 4: /* xmlcharrefreplace */
5063 /* generate replacement (temporarily (mis)uses p) */
5064 for (p = collstart; p < collend; ++p)
5065 output += sprintf(output, "&#%d;", (int)*p);
5066 p = collend;
5067 break;
5068 default:
5069 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5070 encoding, reason, s, length, &exc,
5071 collstart-s, collend-s, &newpos);
5072 if (repunicode == NULL)
5073 goto onError;
5074 /* generate replacement */
5075 repsize = PyUnicode_GET_SIZE(repunicode);
5076 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5077 Py_UNICODE ch = *uni2;
5078 if (Py_UNICODE_ISSPACE(ch))
5079 *output++ = ' ';
5080 else {
5081 decimal = Py_UNICODE_TODECIMAL(ch);
5082 if (decimal >= 0)
5083 *output++ = '0' + decimal;
5084 else if (0 < ch && ch < 256)
5085 *output++ = (char)ch;
5086 else {
5087 Py_DECREF(repunicode);
5088 raise_encode_exception(&exc, encoding,
5089 s, length, collstart-s, collend-s, reason);
5090 goto onError;
5091 }
5092 }
5093 }
5094 p = s + newpos;
5095 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005096 }
5097 }
5098 /* 0-terminate the output string */
5099 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 Py_XDECREF(exc);
5101 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005102 return 0;
5103
5104 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 Py_XDECREF(exc);
5106 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005107 return -1;
5108}
5109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110/* --- Helpers ------------------------------------------------------------ */
5111
Eric Smitha9f7d622008-02-17 19:46:49 +00005112#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005113
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005114#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005115
Fredrik Lundha50d2012006-05-26 17:04:58 +00005116#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005117
5118#include "stringlib/count.h"
5119#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005120#include "stringlib/partition.h"
5121
/* Helper macro to fix up start/end slice values against (obj)->length.
   It operates on variables named start and end in the enclosing scope:
   a negative start has the length added and is then clamped to 0; an
   end beyond the length is cut back to the length, and a negative end
   has the length added and is then clamped to 0.
   Note: expands to several statements and evaluates obj more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5134
Martin v. Löwis18e16552006-02-15 17:27:45 +00005135Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005136 PyObject *substr,
5137 Py_ssize_t start,
5138 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005141 PyUnicodeObject* str_obj;
5142 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005143
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005144 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5145 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005147 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5148 if (!sub_obj) {
5149 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return -1;
5151 }
Tim Petersced69f82003-09-16 20:30:58 +00005152
Fredrik Lundhc8162812006-05-26 19:33:03 +00005153 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005154
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005155 result = stringlib_count(
5156 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5157 );
5158
5159 Py_DECREF(sub_obj);
5160 Py_DECREF(str_obj);
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 return result;
5163}
5164
Martin v. Löwis18e16552006-02-15 17:27:45 +00005165Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005166 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005167 Py_ssize_t start,
5168 Py_ssize_t end,
5169 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005171 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005172
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005173 str = PyUnicode_FromObject(str);
5174 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005175 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005176 sub = PyUnicode_FromObject(sub);
5177 if (!sub) {
5178 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005179 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 }
Tim Petersced69f82003-09-16 20:30:58 +00005181
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005182 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005183 result = stringlib_find_slice(
5184 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5185 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5186 start, end
5187 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005188 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005189 result = stringlib_rfind_slice(
5190 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5191 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5192 start, end
5193 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005194
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005195 Py_DECREF(str);
5196 Py_DECREF(sub);
5197
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 return result;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202int tailmatch(PyUnicodeObject *self,
5203 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t start,
5205 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 int direction)
5207{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 if (substring->length == 0)
5209 return 1;
5210
Fredrik Lundhc8162812006-05-26 19:33:03 +00005211 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
5213 end -= substring->length;
5214 if (end < start)
5215 return 0;
5216
5217 if (direction > 0) {
5218 if (Py_UNICODE_MATCH(self, end, substring))
5219 return 1;
5220 } else {
5221 if (Py_UNICODE_MATCH(self, start, substring))
5222 return 1;
5223 }
5224
5225 return 0;
5226}
5227
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t start,
5231 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 int direction)
5233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 str = PyUnicode_FromObject(str);
5237 if (str == NULL)
5238 return -1;
5239 substr = PyUnicode_FromObject(substr);
5240 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005241 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 return -1;
5243 }
Tim Petersced69f82003-09-16 20:30:58 +00005244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 result = tailmatch((PyUnicodeObject *)str,
5246 (PyUnicodeObject *)substr,
5247 start, end, direction);
5248 Py_DECREF(str);
5249 Py_DECREF(substr);
5250 return result;
5251}
5252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253/* Apply fixfct filter to the Unicode object self and return a
5254 reference to the modified object */
5255
Tim Petersced69f82003-09-16 20:30:58 +00005256static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257PyObject *fixup(PyUnicodeObject *self,
5258 int (*fixfct)(PyUnicodeObject *s))
5259{
5260
5261 PyUnicodeObject *u;
5262
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005263 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 if (u == NULL)
5265 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005266
5267 Py_UNICODE_COPY(u->str, self->str, self->length);
5268
Tim Peters7a29bd52001-09-12 03:03:31 +00005269 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 /* fixfct should return TRUE if it modified the buffer. If
5271 FALSE, return a reference to the original buffer instead
5272 (to save space, not time) */
5273 Py_INCREF(self);
5274 Py_DECREF(u);
5275 return (PyObject*) self;
5276 }
5277 return (PyObject*) u;
5278}
5279
Tim Petersced69f82003-09-16 20:30:58 +00005280static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281int fixupper(PyUnicodeObject *self)
5282{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005283 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 Py_UNICODE *s = self->str;
5285 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005286
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 while (len-- > 0) {
5288 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 ch = Py_UNICODE_TOUPPER(*s);
5291 if (ch != *s) {
5292 status = 1;
5293 *s = ch;
5294 }
5295 s++;
5296 }
5297
5298 return status;
5299}
5300
Tim Petersced69f82003-09-16 20:30:58 +00005301static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302int fixlower(PyUnicodeObject *self)
5303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 Py_UNICODE *s = self->str;
5306 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005307
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 while (len-- > 0) {
5309 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005310
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 ch = Py_UNICODE_TOLOWER(*s);
5312 if (ch != *s) {
5313 status = 1;
5314 *s = ch;
5315 }
5316 s++;
5317 }
5318
5319 return status;
5320}
5321
Tim Petersced69f82003-09-16 20:30:58 +00005322static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323int fixswapcase(PyUnicodeObject *self)
5324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005325 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 Py_UNICODE *s = self->str;
5327 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 while (len-- > 0) {
5330 if (Py_UNICODE_ISUPPER(*s)) {
5331 *s = Py_UNICODE_TOLOWER(*s);
5332 status = 1;
5333 } else if (Py_UNICODE_ISLOWER(*s)) {
5334 *s = Py_UNICODE_TOUPPER(*s);
5335 status = 1;
5336 }
5337 s++;
5338 }
5339
5340 return status;
5341}
5342
Tim Petersced69f82003-09-16 20:30:58 +00005343static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344int fixcapitalize(PyUnicodeObject *self)
5345{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005347 Py_UNICODE *s = self->str;
5348 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005349
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005350 if (len == 0)
5351 return 0;
5352 if (Py_UNICODE_ISLOWER(*s)) {
5353 *s = Py_UNICODE_TOUPPER(*s);
5354 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005356 s++;
5357 while (--len > 0) {
5358 if (Py_UNICODE_ISUPPER(*s)) {
5359 *s = Py_UNICODE_TOLOWER(*s);
5360 status = 1;
5361 }
5362 s++;
5363 }
5364 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365}
5366
5367static
5368int fixtitle(PyUnicodeObject *self)
5369{
5370 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5371 register Py_UNICODE *e;
5372 int previous_is_cased;
5373
5374 /* Shortcut for single character strings */
5375 if (PyUnicode_GET_SIZE(self) == 1) {
5376 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5377 if (*p != ch) {
5378 *p = ch;
5379 return 1;
5380 }
5381 else
5382 return 0;
5383 }
Tim Petersced69f82003-09-16 20:30:58 +00005384
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 e = p + PyUnicode_GET_SIZE(self);
5386 previous_is_cased = 0;
5387 for (; p < e; p++) {
5388 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 if (previous_is_cased)
5391 *p = Py_UNICODE_TOLOWER(ch);
5392 else
5393 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005394
5395 if (Py_UNICODE_ISLOWER(ch) ||
5396 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 Py_UNICODE_ISTITLE(ch))
5398 previous_is_cased = 1;
5399 else
5400 previous_is_cased = 0;
5401 }
5402 return 1;
5403}
5404
Tim Peters8ce9f162004-08-27 01:49:32 +00005405PyObject *
5406PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407{
Tim Peters8ce9f162004-08-27 01:49:32 +00005408 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005409 const Py_UNICODE blank = ' ';
5410 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005411 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005413 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5414 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005415 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5416 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005417 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005418 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005419 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 fseq = PySequence_Fast(seq, "");
5422 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005424 }
5425
Tim Peters91879ab2004-08-27 22:35:44 +00005426 /* Grrrr. A codec may be invoked to convert str objects to
5427 * Unicode, and so it's possible to call back into Python code
5428 * during PyUnicode_FromObject(), and so it's possible for a sick
5429 * codec to change the size of fseq (if seq is a list). Therefore
5430 * we have to keep refetching the size -- can't assume seqlen
5431 * is invariant.
5432 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 seqlen = PySequence_Fast_GET_SIZE(fseq);
5434 /* If empty sequence, return u"". */
5435 if (seqlen == 0) {
5436 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5437 goto Done;
5438 }
5439 /* If singleton sequence with an exact Unicode, return that. */
5440 if (seqlen == 1) {
5441 item = PySequence_Fast_GET_ITEM(fseq, 0);
5442 if (PyUnicode_CheckExact(item)) {
5443 Py_INCREF(item);
5444 res = (PyUnicodeObject *)item;
5445 goto Done;
5446 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005447 }
5448
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 /* At least two items to join, or one that isn't exact Unicode. */
5450 if (seqlen > 1) {
5451 /* Set up sep and seplen -- they're needed. */
5452 if (separator == NULL) {
5453 sep = &blank;
5454 seplen = 1;
5455 }
5456 else {
5457 internal_separator = PyUnicode_FromObject(separator);
5458 if (internal_separator == NULL)
5459 goto onError;
5460 sep = PyUnicode_AS_UNICODE(internal_separator);
5461 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005462 /* In case PyUnicode_FromObject() mutated seq. */
5463 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005464 }
5465 }
5466
5467 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005468 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005469 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005470 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005471 res_p = PyUnicode_AS_UNICODE(res);
5472 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005473
Tim Peters05eba1f2004-08-27 21:32:02 +00005474 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005475 Py_ssize_t itemlen;
5476 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005477
5478 item = PySequence_Fast_GET_ITEM(fseq, i);
5479 /* Convert item to Unicode. */
5480 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5481 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005482 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005483 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005484 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005485 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005486 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 item = PyUnicode_FromObject(item);
5488 if (item == NULL)
5489 goto onError;
5490 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005491
Tim Peters91879ab2004-08-27 22:35:44 +00005492 /* In case PyUnicode_FromObject() mutated seq. */
5493 seqlen = PySequence_Fast_GET_SIZE(fseq);
5494
Tim Peters8ce9f162004-08-27 01:49:32 +00005495 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005497 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005498 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005499 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005500 if (i < seqlen - 1) {
5501 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005502 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005503 goto Overflow;
5504 }
5505 if (new_res_used > res_alloc) {
5506 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005509 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005510 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005512 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005513 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005515 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005518
5519 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005520 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005521 res_p += itemlen;
5522 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005523 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 res_p += seplen;
5525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005527 res_used = new_res_used;
5528 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005529
Tim Peters05eba1f2004-08-27 21:32:02 +00005530 /* Shrink res to match the used area; this probably can't fail,
5531 * but it's cheap to check.
5532 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005533 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 goto onError;
5535
5536 Done:
5537 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 return (PyObject *)res;
5540
Tim Peters8ce9f162004-08-27 01:49:32 +00005541 Overflow:
5542 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005543 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005544 Py_DECREF(item);
5545 /* fall through */
5546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005548 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005549 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005550 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 return NULL;
5552}
5553
Tim Petersced69f82003-09-16 20:30:58 +00005554static
5555PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005556 Py_ssize_t left,
5557 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 Py_UNICODE fill)
5559{
5560 PyUnicodeObject *u;
5561
5562 if (left < 0)
5563 left = 0;
5564 if (right < 0)
5565 right = 0;
5566
Tim Peters7a29bd52001-09-12 03:03:31 +00005567 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 Py_INCREF(self);
5569 return self;
5570 }
5571
5572 u = _PyUnicode_New(left + self->length + right);
5573 if (u) {
5574 if (left)
5575 Py_UNICODE_FILL(u->str, fill, left);
5576 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5577 if (right)
5578 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5579 }
5580
5581 return u;
5582}
5583
/* Append data[left:right] as a new unicode object to the list being
   built.  Relies on the enclosing function providing a PyObject *str
   scratch variable, a list variable and an onError label; jumps to
   onError if the object cannot be created or appended.  The temporary
   reference held in str is released in either case.
   Note: expands to several statements. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5594
5595static
5596PyObject *split_whitespace(PyUnicodeObject *self,
5597 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005598 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005600 register Py_ssize_t i;
5601 register Py_ssize_t j;
5602 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005604 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605
5606 for (i = j = 0; i < len; ) {
5607 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005608 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 i++;
5610 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005611 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 i++;
5613 if (j < i) {
5614 if (maxcount-- <= 0)
5615 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005616 SPLIT_APPEND(buf, j, i);
5617 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 i++;
5619 j = i;
5620 }
5621 }
5622 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005623 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 }
5625 return list;
5626
5627 onError:
5628 Py_DECREF(list);
5629 return NULL;
5630}
5631
5632PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005633 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 register Py_ssize_t i;
5636 register Py_ssize_t j;
5637 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 PyObject *list;
5639 PyObject *str;
5640 Py_UNICODE *data;
5641
5642 string = PyUnicode_FromObject(string);
5643 if (string == NULL)
5644 return NULL;
5645 data = PyUnicode_AS_UNICODE(string);
5646 len = PyUnicode_GET_SIZE(string);
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 list = PyList_New(0);
5649 if (!list)
5650 goto onError;
5651
5652 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005656 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
5659 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005660 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 if (i < len) {
5662 if (data[i] == '\r' && i + 1 < len &&
5663 data[i+1] == '\n')
5664 i += 2;
5665 else
5666 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005667 if (keepends)
5668 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 }
Guido van Rossum86662912000-04-11 15:38:46 +00005670 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 j = i;
5672 }
5673 if (j < len) {
5674 SPLIT_APPEND(data, j, len);
5675 }
5676
5677 Py_DECREF(string);
5678 return list;
5679
5680 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005681 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 Py_DECREF(string);
5683 return NULL;
5684}
5685
Tim Petersced69f82003-09-16 20:30:58 +00005686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687PyObject *split_char(PyUnicodeObject *self,
5688 PyObject *list,
5689 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005690 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005692 register Py_ssize_t i;
5693 register Py_ssize_t j;
5694 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005696 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
5698 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005699 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 if (maxcount-- <= 0)
5701 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005702 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 i = j = i + 1;
5704 } else
5705 i++;
5706 }
5707 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005708 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 }
5710 return list;
5711
5712 onError:
5713 Py_DECREF(list);
5714 return NULL;
5715}
5716
Tim Petersced69f82003-09-16 20:30:58 +00005717static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718PyObject *split_substring(PyUnicodeObject *self,
5719 PyObject *list,
5720 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005723 register Py_ssize_t i;
5724 register Py_ssize_t j;
5725 Py_ssize_t len = self->length;
5726 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 PyObject *str;
5728
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005729 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 if (Py_UNICODE_MATCH(self, i, substring)) {
5731 if (maxcount-- <= 0)
5732 break;
5733 SPLIT_APPEND(self->str, j, i);
5734 i = j = i + sublen;
5735 } else
5736 i++;
5737 }
5738 if (j <= len) {
5739 SPLIT_APPEND(self->str, j, len);
5740 }
5741 return list;
5742
5743 onError:
5744 Py_DECREF(list);
5745 return NULL;
5746}
5747
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005748static
5749PyObject *rsplit_whitespace(PyUnicodeObject *self,
5750 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 register Py_ssize_t i;
5754 register Py_ssize_t j;
5755 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005757 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005758
5759 for (i = j = len - 1; i >= 0; ) {
5760 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005761 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762 i--;
5763 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005764 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 i--;
5766 if (j > i) {
5767 if (maxcount-- <= 0)
5768 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005769 SPLIT_APPEND(buf, i + 1, j + 1);
5770 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 i--;
5772 j = i;
5773 }
5774 }
5775 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005776 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005778 if (PyList_Reverse(list) < 0)
5779 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005780 return list;
5781
5782 onError:
5783 Py_DECREF(list);
5784 return NULL;
5785}
5786
5787static
5788PyObject *rsplit_char(PyUnicodeObject *self,
5789 PyObject *list,
5790 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005791 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 register Py_ssize_t i;
5794 register Py_ssize_t j;
5795 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005797 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798
5799 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005800 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801 if (maxcount-- <= 0)
5802 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005803 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 j = i = i - 1;
5805 } else
5806 i--;
5807 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005808 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005809 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005810 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005811 if (PyList_Reverse(list) < 0)
5812 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005813 return list;
5814
5815 onError:
5816 Py_DECREF(list);
5817 return NULL;
5818}
5819
5820static
5821PyObject *rsplit_substring(PyUnicodeObject *self,
5822 PyObject *list,
5823 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005824 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005825{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005826 register Py_ssize_t i;
5827 register Py_ssize_t j;
5828 Py_ssize_t len = self->length;
5829 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830 PyObject *str;
5831
5832 for (i = len - sublen, j = len; i >= 0; ) {
5833 if (Py_UNICODE_MATCH(self, i, substring)) {
5834 if (maxcount-- <= 0)
5835 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005836 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005837 j = i;
5838 i -= sublen;
5839 } else
5840 i--;
5841 }
5842 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005843 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005845 if (PyList_Reverse(list) < 0)
5846 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005847 return list;
5848
5849 onError:
5850 Py_DECREF(list);
5851 return NULL;
5852}
5853
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854#undef SPLIT_APPEND
5855
5856static
5857PyObject *split(PyUnicodeObject *self,
5858 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860{
5861 PyObject *list;
5862
5863 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005864 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866 list = PyList_New(0);
5867 if (!list)
5868 return NULL;
5869
5870 if (substring == NULL)
5871 return split_whitespace(self,list,maxcount);
5872
5873 else if (substring->length == 1)
5874 return split_char(self,list,substring->str[0],maxcount);
5875
5876 else if (substring->length == 0) {
5877 Py_DECREF(list);
5878 PyErr_SetString(PyExc_ValueError, "empty separator");
5879 return NULL;
5880 }
5881 else
5882 return split_substring(self,list,substring,maxcount);
5883}
5884
Tim Petersced69f82003-09-16 20:30:58 +00005885static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005886PyObject *rsplit(PyUnicodeObject *self,
5887 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005888 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005889{
5890 PyObject *list;
5891
5892 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005893 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894
5895 list = PyList_New(0);
5896 if (!list)
5897 return NULL;
5898
5899 if (substring == NULL)
5900 return rsplit_whitespace(self,list,maxcount);
5901
5902 else if (substring->length == 1)
5903 return rsplit_char(self,list,substring->str[0],maxcount);
5904
5905 else if (substring->length == 0) {
5906 Py_DECREF(list);
5907 PyErr_SetString(PyExc_ValueError, "empty separator");
5908 return NULL;
5909 }
5910 else
5911 return rsplit_substring(self,list,substring,maxcount);
5912}
5913
5914static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915PyObject *replace(PyUnicodeObject *self,
5916 PyUnicodeObject *str1,
5917 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
5920 PyUnicodeObject *u;
5921
5922 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005923 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Fredrik Lundh347ee272006-05-24 16:35:18 +00005925 if (str1->length == str2->length) {
5926 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005927 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005928 if (str1->length == 1) {
5929 /* replace characters */
5930 Py_UNICODE u1, u2;
5931 if (!findchar(self->str, self->length, str1->str[0]))
5932 goto nothing;
5933 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5934 if (!u)
5935 return NULL;
5936 Py_UNICODE_COPY(u->str, self->str, self->length);
5937 u1 = str1->str[0];
5938 u2 = str2->str[0];
5939 for (i = 0; i < u->length; i++)
5940 if (u->str[i] == u1) {
5941 if (--maxcount < 0)
5942 break;
5943 u->str[i] = u2;
5944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005946 i = fastsearch(
5947 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005949 if (i < 0)
5950 goto nothing;
5951 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5952 if (!u)
5953 return NULL;
5954 Py_UNICODE_COPY(u->str, self->str, self->length);
5955 while (i <= self->length - str1->length)
5956 if (Py_UNICODE_MATCH(self, i, str1)) {
5957 if (--maxcount < 0)
5958 break;
5959 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5960 i += str1->length;
5961 } else
5962 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005965
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005966 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005967 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 Py_UNICODE *p;
5969
5970 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005971 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 if (n > maxcount)
5973 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005974 if (n == 0)
5975 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005976 /* new_size = self->length + n * (str2->length - str1->length)); */
5977 delta = (str2->length - str1->length);
5978 if (delta == 0) {
5979 new_size = self->length;
5980 } else {
5981 product = n * (str2->length - str1->length);
5982 if ((product / (str2->length - str1->length)) != n) {
5983 PyErr_SetString(PyExc_OverflowError,
5984 "replace string is too long");
5985 return NULL;
5986 }
5987 new_size = self->length + product;
5988 if (new_size < 0) {
5989 PyErr_SetString(PyExc_OverflowError,
5990 "replace string is too long");
5991 return NULL;
5992 }
5993 }
5994 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005995 if (!u)
5996 return NULL;
5997 i = 0;
5998 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005999 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006000 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006001 while (n-- > 0) {
6002 /* look for next match */
6003 j = i;
6004 while (j <= e) {
6005 if (Py_UNICODE_MATCH(self, j, str1))
6006 break;
6007 j++;
6008 }
6009 if (j > i) {
6010 if (j > e)
6011 break;
6012 /* copy unchanged part [i:j] */
6013 Py_UNICODE_COPY(p, self->str+i, j-i);
6014 p += j - i;
6015 }
6016 /* copy substitution string */
6017 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006018 Py_UNICODE_COPY(p, str2->str, str2->length);
6019 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006020 }
6021 i = j + str1->length;
6022 }
6023 if (i < self->length)
6024 /* copy tail [i:] */
6025 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006026 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006027 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006028 while (n > 0) {
6029 Py_UNICODE_COPY(p, str2->str, str2->length);
6030 p += str2->length;
6031 if (--n <= 0)
6032 break;
6033 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006035 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 }
6037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006039
6040nothing:
6041 /* nothing to replace; return original string (when possible) */
6042 if (PyUnicode_CheckExact(self)) {
6043 Py_INCREF(self);
6044 return (PyObject *) self;
6045 }
6046 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047}
6048
6049/* --- Unicode Object Methods --------------------------------------------- */
6050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006051PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052"S.title() -> unicode\n\
6053\n\
6054Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056
6057static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006058unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 return fixup(self, fixtitle);
6061}
6062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064"S.capitalize() -> unicode\n\
6065\n\
6066Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
6069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006070unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 return fixup(self, fixcapitalize);
6073}
6074
#if 0
/* Compiled out: this method is currently not built. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run fixup(..., fixcapitalize) on every word
   and join the words again with PyUnicode_Join(NULL, ...).  Returns the
   joined object, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6112
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006113/* Argument converter. Coerces to a single unicode character */
6114
6115static int
6116convert_uc(PyObject *obj, void *addr)
6117{
6118 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6119 PyObject *uniobj;
6120 Py_UNICODE *unistr;
6121
6122 uniobj = PyUnicode_FromObject(obj);
6123 if (uniobj == NULL) {
6124 PyErr_SetString(PyExc_TypeError,
6125 "The fill character cannot be converted to Unicode");
6126 return 0;
6127 }
6128 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6129 PyErr_SetString(PyExc_TypeError,
6130 "The fill character must be exactly one character long");
6131 Py_DECREF(uniobj);
6132 return 0;
6133 }
6134 unistr = PyUnicode_AS_UNICODE(uniobj);
6135 *fillcharloc = unistr[0];
6136 Py_DECREF(uniobj);
6137 return 1;
6138}
6139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006140PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006141"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006143Return S centered in a Unicode string of length width. Padding is\n\
6144done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
6146static PyObject *
6147unicode_center(PyUnicodeObject *self, PyObject *args)
6148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006149 Py_ssize_t marg, left;
6150 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006151 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Thomas Woutersde017742006-02-16 19:34:37 +00006153 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return NULL;
6155
Tim Peters7a29bd52001-09-12 03:03:31 +00006156 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_INCREF(self);
6158 return (PyObject*) self;
6159 }
6160
6161 marg = width - self->length;
6162 left = marg / 2 + (marg & width & 1);
6163
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006164 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165}
6166
Marc-André Lemburge5034372000-08-08 08:04:29 +00006167#if 0
6168
6169/* This code should go into some future Unicode collation support
6170 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006171 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006172
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006173/* speedy UTF-16 code point order comparison */
6174/* gleaned from: */
6175/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6176
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006177static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006179 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006180 0, 0, 0, 0, 0, 0, 0, 0,
6181 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006182 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006183};
6184
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185static int
6186unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006188 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 Py_UNICODE *s1 = str1->str;
6191 Py_UNICODE *s2 = str2->str;
6192
6193 len1 = str1->length;
6194 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006195
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006197 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006198
6199 c1 = *s1++;
6200 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006201
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006202 if (c1 > (1<<11) * 26)
6203 c1 += utf16Fixup[c1>>11];
6204 if (c2 > (1<<11) * 26)
6205 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006206 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006207
6208 if (c1 != c2)
6209 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006210
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006211 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 }
6213
6214 return (len1 < len2) ? -1 : (len1 != len2);
6215}
6216
Marc-André Lemburge5034372000-08-08 08:04:29 +00006217#else
6218
6219static int
6220unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6221{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006222 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006223
6224 Py_UNICODE *s1 = str1->str;
6225 Py_UNICODE *s2 = str2->str;
6226
6227 len1 = str1->length;
6228 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006229
Marc-André Lemburge5034372000-08-08 08:04:29 +00006230 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006231 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006232
Fredrik Lundh45714e92001-06-26 16:39:36 +00006233 c1 = *s1++;
6234 c2 = *s2++;
6235
6236 if (c1 != c2)
6237 return (c1 < c2) ? -1 : 1;
6238
Marc-André Lemburge5034372000-08-08 08:04:29 +00006239 len1--; len2--;
6240 }
6241
6242 return (len1 < len2) ? -1 : (len1 != len2);
6243}
6244
6245#endif
6246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247int PyUnicode_Compare(PyObject *left,
6248 PyObject *right)
6249{
6250 PyUnicodeObject *u = NULL, *v = NULL;
6251 int result;
6252
6253 /* Coerce the two arguments */
6254 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6255 if (u == NULL)
6256 goto onError;
6257 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6258 if (v == NULL)
6259 goto onError;
6260
Thomas Wouters7e474022000-07-16 12:04:32 +00006261 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 if (v == u) {
6263 Py_DECREF(u);
6264 Py_DECREF(v);
6265 return 0;
6266 }
6267
6268 result = unicode_compare(u, v);
6269
6270 Py_DECREF(u);
6271 Py_DECREF(v);
6272 return result;
6273
6274onError:
6275 Py_XDECREF(u);
6276 Py_XDECREF(v);
6277 return -1;
6278}
6279
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006280PyObject *PyUnicode_RichCompare(PyObject *left,
6281 PyObject *right,
6282 int op)
6283{
6284 int result;
6285
6286 result = PyUnicode_Compare(left, right);
6287 if (result == -1 && PyErr_Occurred())
6288 goto onError;
6289
6290 /* Convert the return value to a Boolean */
6291 switch (op) {
6292 case Py_EQ:
6293 result = (result == 0);
6294 break;
6295 case Py_NE:
6296 result = (result != 0);
6297 break;
6298 case Py_LE:
6299 result = (result <= 0);
6300 break;
6301 case Py_GE:
6302 result = (result >= 0);
6303 break;
6304 case Py_LT:
6305 result = (result == -1);
6306 break;
6307 case Py_GT:
6308 result = (result == 1);
6309 break;
6310 }
6311 return PyBool_FromLong(result);
6312
6313 onError:
6314
6315 /* Standard case
6316
6317 Type errors mean that PyUnicode_FromObject() could not convert
6318 one of the arguments (usually the right hand side) to Unicode,
6319 ie. we can't handle the comparison request. However, it is
6320 possible that the other object knows a comparison method, which
6321 is why we return Py_NotImplemented to give the other object a
6322 chance.
6323
6324 */
6325 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6326 PyErr_Clear();
6327 Py_INCREF(Py_NotImplemented);
6328 return Py_NotImplemented;
6329 }
6330 if (op != Py_EQ && op != Py_NE)
6331 return NULL;
6332
6333 /* Equality comparison.
6334
6335 This is a special case: we silence any PyExc_UnicodeDecodeError
6336 and instead turn it into a PyErr_UnicodeWarning.
6337
6338 */
6339 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6340 return NULL;
6341 PyErr_Clear();
6342 if (PyErr_Warn(PyExc_UnicodeWarning,
6343 (op == Py_EQ) ?
6344 "Unicode equal comparison "
6345 "failed to convert both arguments to Unicode - "
6346 "interpreting them as being unequal" :
6347 "Unicode unequal comparison "
6348 "failed to convert both arguments to Unicode - "
6349 "interpreting them as being unequal"
6350 ) < 0)
6351 return NULL;
6352 result = (op == Py_NE);
6353 return PyBool_FromLong(result);
6354}
6355
Guido van Rossum403d68b2000-03-13 15:55:09 +00006356int PyUnicode_Contains(PyObject *container,
6357 PyObject *element)
6358{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006359 PyObject *str, *sub;
6360 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006361
6362 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006363 sub = PyUnicode_FromObject(element);
6364 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006365 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006366 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006367 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006368 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006369
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006370 str = PyUnicode_FromObject(container);
6371 if (!str) {
6372 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006373 return -1;
6374 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006375
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006376 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006377
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006378 Py_DECREF(str);
6379 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006380
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006381 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006382}
6383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384/* Concat to string or Unicode object giving a new Unicode object. */
6385
6386PyObject *PyUnicode_Concat(PyObject *left,
6387 PyObject *right)
6388{
6389 PyUnicodeObject *u = NULL, *v = NULL, *w;
6390
6391 /* Coerce the two arguments */
6392 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6393 if (u == NULL)
6394 goto onError;
6395 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6396 if (v == NULL)
6397 goto onError;
6398
6399 /* Shortcuts */
6400 if (v == unicode_empty) {
6401 Py_DECREF(v);
6402 return (PyObject *)u;
6403 }
6404 if (u == unicode_empty) {
6405 Py_DECREF(u);
6406 return (PyObject *)v;
6407 }
6408
6409 /* Concat the two Unicode strings */
6410 w = _PyUnicode_New(u->length + v->length);
6411 if (w == NULL)
6412 goto onError;
6413 Py_UNICODE_COPY(w->str, u->str, u->length);
6414 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6415
6416 Py_DECREF(u);
6417 Py_DECREF(v);
6418 return (PyObject *)w;
6419
6420onError:
6421 Py_XDECREF(u);
6422 Py_XDECREF(v);
6423 return NULL;
6424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427"S.count(sub[, start[, end]]) -> int\n\
6428\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006429Return the number of non-overlapping occurrences of substring sub in\n\
6430Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject *
6434unicode_count(PyUnicodeObject *self, PyObject *args)
6435{
6436 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006437 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006438 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 PyObject *result;
6440
Guido van Rossumb8872e62000-05-09 14:14:27 +00006441 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6442 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 return NULL;
6444
6445 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006446 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 if (substring == NULL)
6448 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Fredrik Lundhc8162812006-05-26 19:33:03 +00006450 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006452 result = PyInt_FromSsize_t(
6453 stringlib_count(self->str + start, end - start,
6454 substring->str, substring->length)
6455 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
6457 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 return result;
6460}
6461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006462PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006463"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006465Encodes S using the codec registered for encoding. encoding defaults\n\
6466to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006467handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6469'xmlcharrefreplace' as well as any other name registered with\n\
6470codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472static PyObject *
6473unicode_encode(PyUnicodeObject *self, PyObject *args)
6474{
6475 char *encoding = NULL;
6476 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477 PyObject *v;
6478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6480 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006481 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006482 if (v == NULL)
6483 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6485 PyErr_Format(PyExc_TypeError,
6486 "encoder did not return a string/unicode object "
6487 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006488 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006489 Py_DECREF(v);
6490 return NULL;
6491 }
6492 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006493
6494 onError:
6495 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006496}
6497
6498PyDoc_STRVAR(decode__doc__,
6499"S.decode([encoding[,errors]]) -> string or unicode\n\
6500\n\
6501Decodes S using the codec registered for encoding. encoding defaults\n\
6502to the default encoding. errors may be given to set a different error\n\
6503handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6504a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6505as well as any other name registerd with codecs.register_error that is\n\
6506able to handle UnicodeDecodeErrors.");
6507
6508static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006509unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006510{
6511 char *encoding = NULL;
6512 char *errors = NULL;
6513 PyObject *v;
6514
6515 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6516 return NULL;
6517 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006518 if (v == NULL)
6519 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006520 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6521 PyErr_Format(PyExc_TypeError,
6522 "decoder did not return a string/unicode object "
6523 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006524 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006525 Py_DECREF(v);
6526 return NULL;
6527 }
6528 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006529
6530 onError:
6531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532}
6533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535"S.expandtabs([tabsize]) -> unicode\n\
6536\n\
6537Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539
6540static PyObject*
6541unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6542{
6543 Py_UNICODE *e;
6544 Py_UNICODE *p;
6545 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006546 Py_UNICODE *qe;
6547 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 PyUnicodeObject *u;
6549 int tabsize = 8;
6550
6551 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6552 return NULL;
6553
Thomas Wouters7e474022000-07-16 12:04:32 +00006554 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006555 i = 0; /* chars up to and including most recent \n or \r */
6556 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6557 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 for (p = self->str; p < e; p++)
6559 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006560 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006561 incr = tabsize - (j % tabsize); /* cannot overflow */
6562 if (j > PY_SSIZE_T_MAX - incr)
6563 goto overflow1;
6564 j += incr;
6565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 }
6567 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006568 if (j > PY_SSIZE_T_MAX - 1)
6569 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 j++;
6571 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006572 if (i > PY_SSIZE_T_MAX - j)
6573 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006575 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
6577 }
6578
Guido van Rossum5bdff602008-03-11 21:18:06 +00006579 if (i > PY_SSIZE_T_MAX - j)
6580 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 /* Second pass: create output string and fill it */
6583 u = _PyUnicode_New(i + j);
6584 if (!u)
6585 return NULL;
6586
Guido van Rossum5bdff602008-03-11 21:18:06 +00006587 j = 0; /* same as in first pass */
6588 q = u->str; /* next output char */
6589 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
6591 for (p = self->str; p < e; p++)
6592 if (*p == '\t') {
6593 if (tabsize > 0) {
6594 i = tabsize - (j % tabsize);
6595 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006596 while (i--) {
6597 if (q >= qe)
6598 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 }
6602 }
6603 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006604 if (q >= qe)
6605 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006607 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 if (*p == '\n' || *p == '\r')
6609 j = 0;
6610 }
6611
6612 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006613
6614 overflow2:
6615 Py_DECREF(u);
6616 overflow1:
6617 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006621PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622"S.find(sub [,start [,end]]) -> int\n\
6623\n\
6624Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006625such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626arguments start and end are interpreted as in slice notation.\n\
6627\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006628Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629
6630static PyObject *
6631unicode_find(PyUnicodeObject *self, PyObject *args)
6632{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006633 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006634 Py_ssize_t start;
6635 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006636 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
Facundo Batista57d56692007-11-16 18:04:14 +00006638 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006641 result = stringlib_find_slice(
6642 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6643 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6644 start, end
6645 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
6647 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006648
6649 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650}
6651
6652static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
6655 if (index < 0 || index >= self->length) {
6656 PyErr_SetString(PyExc_IndexError, "string index out of range");
6657 return NULL;
6658 }
6659
6660 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6661}
6662
6663static long
6664unicode_hash(PyUnicodeObject *self)
6665{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006666 /* Since Unicode objects compare equal to their ASCII string
6667 counterparts, they should use the individual character values
6668 as basis for their hash value. This is needed to assure that
6669 strings and Unicode objects behave in the same way as
6670 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006673 register Py_UNICODE *p;
6674 register long x;
6675
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 if (self->hash != -1)
6677 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006678 len = PyUnicode_GET_SIZE(self);
6679 p = PyUnicode_AS_UNICODE(self);
6680 x = *p << 7;
6681 while (--len >= 0)
6682 x = (1000003*x) ^ *p++;
6683 x ^= PyUnicode_GET_SIZE(self);
6684 if (x == -1)
6685 x = -2;
6686 self->hash = x;
6687 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006690PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691"S.index(sub [,start [,end]]) -> int\n\
6692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
6695static PyObject *
6696unicode_index(PyUnicodeObject *self, PyObject *args)
6697{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006698 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006699 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006700 Py_ssize_t start;
6701 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
Facundo Batista57d56692007-11-16 18:04:14 +00006703 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006706 result = stringlib_find_slice(
6707 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6708 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6709 start, end
6710 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
6712 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (result < 0) {
6715 PyErr_SetString(PyExc_ValueError, "substring not found");
6716 return NULL;
6717 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006718
Martin v. Löwis18e16552006-02-15 17:27:45 +00006719 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006723"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006725Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006726at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
6728static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006729unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
6731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6732 register const Py_UNICODE *e;
6733 int cased;
6734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 /* Shortcut for single character strings */
6736 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006740 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006741 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 e = p + PyUnicode_GET_SIZE(self);
6744 cased = 0;
6745 for (; p < e; p++) {
6746 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 else if (!cased && Py_UNICODE_ISLOWER(ch))
6751 cased = 1;
6752 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006753 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006756PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006757"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006759Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006760at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761
6762static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006763unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
6765 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6766 register const Py_UNICODE *e;
6767 int cased;
6768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 /* Shortcut for single character strings */
6770 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006771 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006773 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006774 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006775 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 e = p + PyUnicode_GET_SIZE(self);
6778 cased = 0;
6779 for (; p < e; p++) {
6780 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 else if (!cased && Py_UNICODE_ISUPPER(ch))
6785 cased = 1;
6786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006787 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006790PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006793Return True if S is a titlecased string and there is at least one\n\
6794character in S, i.e. upper- and titlecase characters may only\n\
6795follow uncased characters and lowercase characters only cased ones.\n\
6796Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
6798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006799unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800{
6801 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6802 register const Py_UNICODE *e;
6803 int cased, previous_is_cased;
6804
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 /* Shortcut for single character strings */
6806 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006807 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6808 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006810 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006811 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 e = p + PyUnicode_GET_SIZE(self);
6815 cased = 0;
6816 previous_is_cased = 0;
6817 for (; p < e; p++) {
6818 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6821 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 previous_is_cased = 1;
6824 cased = 1;
6825 }
6826 else if (Py_UNICODE_ISLOWER(ch)) {
6827 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 previous_is_cased = 1;
6830 cased = 1;
6831 }
6832 else
6833 previous_is_cased = 0;
6834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836}
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006841Return True if all characters in S are whitespace\n\
6842and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
6844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006845unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846{
6847 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6848 register const Py_UNICODE *e;
6849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 /* Shortcut for single character strings */
6851 if (PyUnicode_GET_SIZE(self) == 1 &&
6852 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006856 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 e = p + PyUnicode_GET_SIZE(self);
6860 for (; p < e; p++) {
6861 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006867PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006870Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006872
6873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006874unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006875{
6876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877 register const Py_UNICODE *e;
6878
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1 &&
6881 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006883
6884 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006885 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006887
6888 e = p + PyUnicode_GET_SIZE(self);
6889 for (; p < e; p++) {
6890 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006894}
6895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006899Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006900and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006901
6902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006903unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006904{
6905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6906 register const Py_UNICODE *e;
6907
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908 /* Shortcut for single character strings */
6909 if (PyUnicode_GET_SIZE(self) == 1 &&
6910 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006911 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006912
6913 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006914 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006916
6917 e = p + PyUnicode_GET_SIZE(self);
6918 for (; p < e; p++) {
6919 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006923}
6924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006925PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006926"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
6931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006932unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
6934 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6935 register const Py_UNICODE *e;
6936
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 /* Shortcut for single character strings */
6938 if (PyUnicode_GET_SIZE(self) == 1 &&
6939 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006940 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006942 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006943 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006944 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 e = p + PyUnicode_GET_SIZE(self);
6947 for (; p < e; p++) {
6948 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006954PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006955"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006957Return True if all characters in S are digits\n\
6958and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
6960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006961unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962{
6963 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6964 register const Py_UNICODE *e;
6965
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 /* Shortcut for single character strings */
6967 if (PyUnicode_GET_SIZE(self) == 1 &&
6968 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006969 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006971 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006972 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006974
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 e = p + PyUnicode_GET_SIZE(self);
6976 for (; p < e; p++) {
6977 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981}
6982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006983PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006984"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006987False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006990unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
6992 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6993 register const Py_UNICODE *e;
6994
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 /* Shortcut for single character strings */
6996 if (PyUnicode_GET_SIZE(self) == 1 &&
6997 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006998 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007000 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007001 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007003
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 e = p + PyUnicode_GET_SIZE(self);
7005 for (; p < e; p++) {
7006 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010}
7011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007012PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013"S.join(sequence) -> unicode\n\
7014\n\
7015Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017
7018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007019unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007021 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022}
7023
Martin v. Löwis18e16552006-02-15 17:27:45 +00007024static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025unicode_length(PyUnicodeObject *self)
7026{
7027 return self->length;
7028}
7029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007030PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007031"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032\n\
7033Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007034done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036static PyObject *
7037unicode_ljust(PyUnicodeObject *self, PyObject *args)
7038{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007039 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007040 Py_UNICODE fillchar = ' ';
7041
Martin v. Löwis412fb672006-04-13 06:34:32 +00007042 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 return NULL;
7044
Tim Peters7a29bd52001-09-12 03:03:31 +00007045 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 Py_INCREF(self);
7047 return (PyObject*) self;
7048 }
7049
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007050 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051}
7052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007053PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054"S.lower() -> unicode\n\
7055\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007056Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057
7058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007059unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 return fixup(self, fixlower);
7062}
7063
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: its format string minus the "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7072
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007073/* externally visible for str.strip(unicode) */
7074PyObject *
7075_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7076{
7077 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007078 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007080 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7081 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007083 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7084
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007085 i = 0;
7086 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007087 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7088 i++;
7089 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090 }
7091
7092 j = len;
7093 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007094 do {
7095 j--;
7096 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7097 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098 }
7099
7100 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007101 Py_INCREF(self);
7102 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103 }
7104 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007105 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106}
7107
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108
7109static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007112 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007113 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114
7115 i = 0;
7116 if (striptype != RIGHTSTRIP) {
7117 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7118 i++;
7119 }
7120 }
7121
7122 j = len;
7123 if (striptype != LEFTSTRIP) {
7124 do {
7125 j--;
7126 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7127 j++;
7128 }
7129
7130 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7131 Py_INCREF(self);
7132 return (PyObject*)self;
7133 }
7134 else
7135 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136}
7137
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138
7139static PyObject *
7140do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7141{
7142 PyObject *sep = NULL;
7143
7144 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7145 return NULL;
7146
7147 if (sep != NULL && sep != Py_None) {
7148 if (PyUnicode_Check(sep))
7149 return _PyUnicode_XStrip(self, striptype, sep);
7150 else if (PyString_Check(sep)) {
7151 PyObject *res;
7152 sep = PyUnicode_FromObject(sep);
7153 if (sep==NULL)
7154 return NULL;
7155 res = _PyUnicode_XStrip(self, striptype, sep);
7156 Py_DECREF(sep);
7157 return res;
7158 }
7159 else {
7160 PyErr_Format(PyExc_TypeError,
7161 "%s arg must be None, unicode or str",
7162 STRIPNAME(striptype));
7163 return NULL;
7164 }
7165 }
7166
7167 return do_strip(self, striptype);
7168}
7169
7170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007171PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007172"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173\n\
7174Return a copy of the string S with leading and trailing\n\
7175whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007176If chars is given and not None, remove characters in chars instead.\n\
7177If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178
7179static PyObject *
7180unicode_strip(PyUnicodeObject *self, PyObject *args)
7181{
7182 if (PyTuple_GET_SIZE(args) == 0)
7183 return do_strip(self, BOTHSTRIP); /* Common case */
7184 else
7185 return do_argstrip(self, BOTHSTRIP, args);
7186}
7187
7188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007190"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191\n\
7192Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007193If chars is given and not None, remove characters in chars instead.\n\
7194If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195
7196static PyObject *
7197unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7198{
7199 if (PyTuple_GET_SIZE(args) == 0)
7200 return do_strip(self, LEFTSTRIP); /* Common case */
7201 else
7202 return do_argstrip(self, LEFTSTRIP, args);
7203}
7204
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007207"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208\n\
7209Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007210If chars is given and not None, remove characters in chars instead.\n\
7211If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212
7213static PyObject *
7214unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7215{
7216 if (PyTuple_GET_SIZE(args) == 0)
7217 return do_strip(self, RIGHTSTRIP); /* Common case */
7218 else
7219 return do_argstrip(self, RIGHTSTRIP, args);
7220}
7221
7222
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007224unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225{
7226 PyUnicodeObject *u;
7227 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007229 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231 if (len < 0)
7232 len = 0;
7233
Tim Peters7a29bd52001-09-12 03:03:31 +00007234 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 /* no repeat, return original string */
7236 Py_INCREF(str);
7237 return (PyObject*) str;
7238 }
Tim Peters8f422462000-09-09 06:13:41 +00007239
7240 /* ensure # of chars needed doesn't overflow int and # of bytes
7241 * needed doesn't overflow size_t
7242 */
7243 nchars = len * str->length;
7244 if (len && nchars / len != str->length) {
7245 PyErr_SetString(PyExc_OverflowError,
7246 "repeated string is too long");
7247 return NULL;
7248 }
7249 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7250 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7251 PyErr_SetString(PyExc_OverflowError,
7252 "repeated string is too long");
7253 return NULL;
7254 }
7255 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 if (!u)
7257 return NULL;
7258
7259 p = u->str;
7260
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007261 if (str->length == 1 && len > 0) {
7262 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007263 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007264 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007265 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007266 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007267 done = str->length;
7268 }
7269 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007270 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007271 Py_UNICODE_COPY(p+done, p, n);
7272 done += n;
7273 }
7274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
7276 return (PyObject*) u;
7277}
7278
7279PyObject *PyUnicode_Replace(PyObject *obj,
7280 PyObject *subobj,
7281 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007282 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283{
7284 PyObject *self;
7285 PyObject *str1;
7286 PyObject *str2;
7287 PyObject *result;
7288
7289 self = PyUnicode_FromObject(obj);
7290 if (self == NULL)
7291 return NULL;
7292 str1 = PyUnicode_FromObject(subobj);
7293 if (str1 == NULL) {
7294 Py_DECREF(self);
7295 return NULL;
7296 }
7297 str2 = PyUnicode_FromObject(replobj);
7298 if (str2 == NULL) {
7299 Py_DECREF(self);
7300 Py_DECREF(str1);
7301 return NULL;
7302 }
Tim Petersced69f82003-09-16 20:30:58 +00007303 result = replace((PyUnicodeObject *)self,
7304 (PyUnicodeObject *)str1,
7305 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 maxcount);
7307 Py_DECREF(self);
7308 Py_DECREF(str1);
7309 Py_DECREF(str2);
7310 return result;
7311}
7312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007313PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314"S.replace (old, new[, maxsplit]) -> unicode\n\
7315\n\
7316Return a copy of S with all occurrences of substring\n\
7317old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
7320static PyObject*
7321unicode_replace(PyUnicodeObject *self, PyObject *args)
7322{
7323 PyUnicodeObject *str1;
7324 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007325 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 PyObject *result;
7327
Martin v. Löwis18e16552006-02-15 17:27:45 +00007328 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 return NULL;
7330 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7331 if (str1 == NULL)
7332 return NULL;
7333 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007334 if (str2 == NULL) {
7335 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338
7339 result = replace(self, str1, str2, maxcount);
7340
7341 Py_DECREF(str1);
7342 Py_DECREF(str2);
7343 return result;
7344}
7345
7346static
7347PyObject *unicode_repr(PyObject *unicode)
7348{
7349 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7350 PyUnicode_GET_SIZE(unicode),
7351 1);
7352}
7353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355"S.rfind(sub [,start [,end]]) -> int\n\
7356\n\
7357Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007358such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359arguments start and end are interpreted as in slice notation.\n\
7360\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362
7363static PyObject *
7364unicode_rfind(PyUnicodeObject *self, PyObject *args)
7365{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007366 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007367 Py_ssize_t start;
7368 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007369 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
Facundo Batista57d56692007-11-16 18:04:14 +00007371 if (!_ParseTupleFinds(args, &substring, &start, &end))
7372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007374 result = stringlib_rfind_slice(
7375 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7376 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7377 start, end
7378 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
7380 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007381
7382 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383}
7384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007385PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386"S.rindex(sub [,start [,end]]) -> int\n\
7387\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007388Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
7390static PyObject *
7391unicode_rindex(PyUnicodeObject *self, PyObject *args)
7392{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007393 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007394 Py_ssize_t start;
7395 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007396 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
Facundo Batista57d56692007-11-16 18:04:14 +00007398 if (!_ParseTupleFinds(args, &substring, &start, &end))
7399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007401 result = stringlib_rfind_slice(
7402 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7403 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7404 start, end
7405 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
7407 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007408
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 if (result < 0) {
7410 PyErr_SetString(PyExc_ValueError, "substring not found");
7411 return NULL;
7412 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007413 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414}
7415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007417"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418\n\
7419Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007420done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
7422static PyObject *
7423unicode_rjust(PyUnicodeObject *self, PyObject *args)
7424{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007425 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007426 Py_UNICODE fillchar = ' ';
7427
Martin v. Löwis412fb672006-04-13 06:34:32 +00007428 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 return NULL;
7430
Tim Peters7a29bd52001-09-12 03:03:31 +00007431 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 Py_INCREF(self);
7433 return (PyObject*) self;
7434 }
7435
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007436 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437}
7438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007440unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441{
7442 /* standard clamping */
7443 if (start < 0)
7444 start = 0;
7445 if (end < 0)
7446 end = 0;
7447 if (end > self->length)
7448 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007449 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 /* full slice, return original string */
7451 Py_INCREF(self);
7452 return (PyObject*) self;
7453 }
7454 if (start > end)
7455 start = end;
7456 /* copy slice */
7457 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7458 end - start);
7459}
7460
7461PyObject *PyUnicode_Split(PyObject *s,
7462 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007463 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464{
7465 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007466
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 s = PyUnicode_FromObject(s);
7468 if (s == NULL)
7469 return NULL;
7470 if (sep != NULL) {
7471 sep = PyUnicode_FromObject(sep);
7472 if (sep == NULL) {
7473 Py_DECREF(s);
7474 return NULL;
7475 }
7476 }
7477
7478 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7479
7480 Py_DECREF(s);
7481 Py_XDECREF(sep);
7482 return result;
7483}
7484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007485PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486"S.split([sep [,maxsplit]]) -> list of strings\n\
7487\n\
7488Return a list of the words in S, using sep as the\n\
7489delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007490splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007491any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
7493static PyObject*
7494unicode_split(PyUnicodeObject *self, PyObject *args)
7495{
7496 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 return NULL;
7501
7502 if (substring == Py_None)
7503 return split(self, NULL, maxcount);
7504 else if (PyUnicode_Check(substring))
7505 return split(self, (PyUnicodeObject *)substring, maxcount);
7506 else
7507 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7508}
7509
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007510PyObject *
7511PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7512{
7513 PyObject* str_obj;
7514 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007515 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007516
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007517 str_obj = PyUnicode_FromObject(str_in);
7518 if (!str_obj)
7519 return NULL;
7520 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007521 if (!sep_obj) {
7522 Py_DECREF(str_obj);
7523 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007524 }
7525
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007526 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007527 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7528 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7529 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007530
Fredrik Lundhb9479482006-05-26 17:22:38 +00007531 Py_DECREF(sep_obj);
7532 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007533
7534 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007535}
7536
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007537
7538PyObject *
7539PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7540{
7541 PyObject* str_obj;
7542 PyObject* sep_obj;
7543 PyObject* out;
7544
7545 str_obj = PyUnicode_FromObject(str_in);
7546 if (!str_obj)
7547 return NULL;
7548 sep_obj = PyUnicode_FromObject(sep_in);
7549 if (!sep_obj) {
7550 Py_DECREF(str_obj);
7551 return NULL;
7552 }
7553
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007554 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007555 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7556 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7557 );
7558
7559 Py_DECREF(sep_obj);
7560 Py_DECREF(str_obj);
7561
7562 return out;
7563}
7564
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007565PyDoc_STRVAR(partition__doc__,
7566"S.partition(sep) -> (head, sep, tail)\n\
7567\n\
7568Searches for the separator sep in S, and returns the part before it,\n\
7569the separator itself, and the part after it. If the separator is not\n\
7570found, returns S and two empty strings.");
7571
7572static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007573unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007574{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007575 return PyUnicode_Partition((PyObject *)self, separator);
7576}
7577
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007578PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007579"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007580\n\
7581Searches for the separator sep in S, starting at the end of S, and returns\n\
7582the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007583separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007584
7585static PyObject*
7586unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7587{
7588 return PyUnicode_RPartition((PyObject *)self, separator);
7589}
7590
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007591PyObject *PyUnicode_RSplit(PyObject *s,
7592 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007594{
7595 PyObject *result;
7596
7597 s = PyUnicode_FromObject(s);
7598 if (s == NULL)
7599 return NULL;
7600 if (sep != NULL) {
7601 sep = PyUnicode_FromObject(sep);
7602 if (sep == NULL) {
7603 Py_DECREF(s);
7604 return NULL;
7605 }
7606 }
7607
7608 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7609
7610 Py_DECREF(s);
7611 Py_XDECREF(sep);
7612 return result;
7613}
7614
7615PyDoc_STRVAR(rsplit__doc__,
7616"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7617\n\
7618Return a list of the words in S, using sep as the\n\
7619delimiter string, starting at the end of the string and\n\
7620working to the front. If maxsplit is given, at most maxsplit\n\
7621splits are done. If sep is not specified, any whitespace string\n\
7622is a separator.");
7623
7624static PyObject*
7625unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7626{
7627 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007628 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007629
Martin v. Löwis18e16552006-02-15 17:27:45 +00007630 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007631 return NULL;
7632
7633 if (substring == Py_None)
7634 return rsplit(self, NULL, maxcount);
7635 else if (PyUnicode_Check(substring))
7636 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7637 else
7638 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7639}
7640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007641PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007642"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643\n\
7644Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007645Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647
7648static PyObject*
7649unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7650{
Guido van Rossum86662912000-04-11 15:38:46 +00007651 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
Guido van Rossum86662912000-04-11 15:38:46 +00007653 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654 return NULL;
7655
Guido van Rossum86662912000-04-11 15:38:46 +00007656 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
7659static
7660PyObject *unicode_str(PyUnicodeObject *self)
7661{
Fred Drakee4315f52000-05-09 19:53:39 +00007662 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663}
7664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007665PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666"S.swapcase() -> unicode\n\
7667\n\
7668Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007669and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007672unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 return fixup(self, fixswapcase);
7675}
7676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678"S.translate(table) -> unicode\n\
7679\n\
7680Return a copy of the string S, where all characters have been mapped\n\
7681through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007682Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7683Unmapped characters are left untouched. Characters mapped to None\n\
7684are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688{
Tim Petersced69f82003-09-16 20:30:58 +00007689 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007691 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 "ignore");
7693}
7694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007695PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696"S.upper() -> unicode\n\
7697\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007698Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
7700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007701unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 return fixup(self, fixupper);
7704}
7705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707"S.zfill(width) -> unicode\n\
7708\n\
7709Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
7712static PyObject *
7713unicode_zfill(PyUnicodeObject *self, PyObject *args)
7714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 PyUnicodeObject *u;
7717
Martin v. Löwis18e16552006-02-15 17:27:45 +00007718 Py_ssize_t width;
7719 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 return NULL;
7721
7722 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007723 if (PyUnicode_CheckExact(self)) {
7724 Py_INCREF(self);
7725 return (PyObject*) self;
7726 }
7727 else
7728 return PyUnicode_FromUnicode(
7729 PyUnicode_AS_UNICODE(self),
7730 PyUnicode_GET_SIZE(self)
7731 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 }
7733
7734 fill = width - self->length;
7735
7736 u = pad(self, fill, 0, '0');
7737
Walter Dörwald068325e2002-04-15 13:36:47 +00007738 if (u == NULL)
7739 return NULL;
7740
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 if (u->str[fill] == '+' || u->str[fill] == '-') {
7742 /* move sign to beginning of string */
7743 u->str[0] = u->str[fill];
7744 u->str[fill] = '0';
7745 }
7746
7747 return (PyObject*) u;
7748}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
#if 0
/* Compiled out.  Reports the current value of the file-level numfree
   counter as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007759"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007761Return True if S starts with the specified prefix, False otherwise.\n\
7762With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007763With optional end, stop comparing S at that position.\n\
7764prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject *
7767unicode_startswith(PyUnicodeObject *self,
7768 PyObject *args)
7769{
Georg Brandl24250812006-06-09 18:45:48 +00007770 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007772 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007773 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007774 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
Georg Brandl24250812006-06-09 18:45:48 +00007776 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007777 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007779 if (PyTuple_Check(subobj)) {
7780 Py_ssize_t i;
7781 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7782 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7783 PyTuple_GET_ITEM(subobj, i));
7784 if (substring == NULL)
7785 return NULL;
7786 result = tailmatch(self, substring, start, end, -1);
7787 Py_DECREF(substring);
7788 if (result) {
7789 Py_RETURN_TRUE;
7790 }
7791 }
7792 /* nothing matched */
7793 Py_RETURN_FALSE;
7794 }
7795 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007797 return NULL;
7798 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007800 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801}
7802
7803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007804PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007805"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007807Return True if S ends with the specified suffix, False otherwise.\n\
7808With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007809With optional end, stop comparing S at that position.\n\
7810suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811
7812static PyObject *
7813unicode_endswith(PyUnicodeObject *self,
7814 PyObject *args)
7815{
Georg Brandl24250812006-06-09 18:45:48 +00007816 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007818 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007819 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007820 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821
Georg Brandl24250812006-06-09 18:45:48 +00007822 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7823 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007825 if (PyTuple_Check(subobj)) {
7826 Py_ssize_t i;
7827 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7828 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7829 PyTuple_GET_ITEM(subobj, i));
7830 if (substring == NULL)
7831 return NULL;
7832 result = tailmatch(self, substring, start, end, +1);
7833 Py_DECREF(substring);
7834 if (result) {
7835 Py_RETURN_TRUE;
7836 }
7837 }
7838 Py_RETURN_FALSE;
7839 }
7840 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
Georg Brandl24250812006-06-09 18:45:48 +00007844 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007846 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847}
7848
7849
/* Implements do_string_format, which is unicode because of stringlib.
   The header is included at this point in the file, so it is compiled
   with whatever stringlib definitions are in effect here. */
#include "stringlib/string_format.h"

/* Docstring for S.format().  Only the signature line is provided; the
   description after the blank line is currently empty. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> unicode\n\
\n\
");

/* Docstring for S.__format__().  Only the signature line is provided;
   the description after the blank line is currently empty. */
PyDoc_STRVAR(p_format__doc__,
"S.__format__(format_spec) -> unicode\n\
\n\
");
7862
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007863
7864static PyObject *
7865unicode_getnewargs(PyUnicodeObject *v)
7866{
7867 return Py_BuildValue("(u#)", v->str, v->length);
7868}
7869
7870
/* Method table: one {name, C function, calling convention, docstring}
   entry per method, terminated by a {NULL, NULL} sentinel.
   NOTE(review): presumably installed as the unicode type's tp_methods;
   the type object is not visible in this part of the file. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* The only entry taking keyword arguments; do_string_format comes from
       the stringlib/string_format.h include earlier in this file. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* helpers are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel: end of table */
};
7932
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007933static PyObject *
7934unicode_mod(PyObject *v, PyObject *w)
7935{
7936 if (!PyUnicode_Check(v)) {
7937 Py_INCREF(Py_NotImplemented);
7938 return Py_NotImplemented;
7939 }
7940 return PyUnicode_Format(v, w);
7941}
7942
/* Number protocol table.  Only the remainder slot is filled in (with
   unicode_mod); the slots listed before it are explicitly 0 and every
   slot after it is left to C's implicit zero-initialisation. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,    /*nb_remainder*/
};
7950
/* Sequence protocol table.  The two assignment slots are 0, i.e. no
   item or slice assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7961
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007962static PyObject*
7963unicode_subscript(PyUnicodeObject* self, PyObject* item)
7964{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007965 if (PyIndex_Check(item)) {
7966 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007967 if (i == -1 && PyErr_Occurred())
7968 return NULL;
7969 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007970 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007971 return unicode_getitem(self, i);
7972 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007974 Py_UNICODE* source_buf;
7975 Py_UNICODE* result_buf;
7976 PyObject* result;
7977
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007978 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007979 &start, &stop, &step, &slicelength) < 0) {
7980 return NULL;
7981 }
7982
7983 if (slicelength <= 0) {
7984 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007985 } else if (start == 0 && step == 1 && slicelength == self->length &&
7986 PyUnicode_CheckExact(self)) {
7987 Py_INCREF(self);
7988 return (PyObject *)self;
7989 } else if (step == 1) {
7990 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007991 } else {
7992 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007993 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7994 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007995
7996 if (result_buf == NULL)
7997 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007998
7999 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8000 result_buf[i] = source_buf[cur];
8001 }
Tim Petersced69f82003-09-16 20:30:58 +00008002
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008003 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008004 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008005 return result;
8006 }
8007 } else {
8008 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8009 return NULL;
8010 }
8011}
8012
/* Mapping protocol table: length and subscripting only; the assignment
   slot is a null function pointer. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,               /* mp_ass_subscript */
};
8018
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008021 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 const void **ptr)
8023{
8024 if (index != 0) {
8025 PyErr_SetString(PyExc_SystemError,
8026 "accessing non-existent unicode segment");
8027 return -1;
8028 }
8029 *ptr = (void *) self->str;
8030 return PyUnicode_GET_DATA_SIZE(self);
8031}
8032
Martin v. Löwis18e16552006-02-15 17:27:45 +00008033static Py_ssize_t
8034unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 const void **ptr)
8036{
8037 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008038 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 return -1;
8040}
8041
8042static int
8043unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008044 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
8046 if (lenp)
8047 *lenp = PyUnicode_GET_DATA_SIZE(self);
8048 return 1;
8049}
8050
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008051static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 const void **ptr)
8055{
8056 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (index != 0) {
8059 PyErr_SetString(PyExc_SystemError,
8060 "accessing non-existent unicode segment");
8061 return -1;
8062 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008063 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 if (str == NULL)
8065 return -1;
8066 *ptr = (void *) PyString_AS_STRING(str);
8067 return PyString_GET_SIZE(str);
8068}
8069
8070/* Helpers for PyUnicode_Format() */
8071
8072static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008073getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008075 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 if (argidx < arglen) {
8077 (*p_argidx)++;
8078 if (arglen < 0)
8079 return args;
8080 else
8081 return PyTuple_GetItem(args, argidx);
8082 }
8083 PyErr_SetString(PyExc_TypeError,
8084 "not enough arguments for format string");
8085 return NULL;
8086}
8087
/* Bit flags collected while parsing a % format specifier; each is set by
   the flag character noted beside it (see the flag loop in
   PyUnicode_Format()). */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8093
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008095strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 register Py_ssize_t i;
8098 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 for (i = len - 1; i >= 0; i--)
8100 buffer[i] = (Py_UNICODE) charbuffer[i];
8101
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 return len;
8103}
8104
Neal Norwitzfc76d632006-01-10 06:03:13 +00008105static int
8106doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8107{
Tim Peters15231542006-02-16 01:08:01 +00008108 Py_ssize_t result;
8109
Neal Norwitzfc76d632006-01-10 06:03:13 +00008110 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008111 result = strtounicode(buffer, (char *)buffer);
8112 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008113}
8114
8115static int
8116longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8117{
Tim Peters15231542006-02-16 01:08:01 +00008118 Py_ssize_t result;
8119
Neal Norwitzfc76d632006-01-10 06:03:13 +00008120 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008121 result = strtounicode(buffer, (char *)buffer);
8122 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008123}
8124
Guido van Rossum078151d2002-08-11 04:24:12 +00008125/* XXX To save some code duplication, formatfloat/long/int could have been
8126 shared with stringobject.c, converting from 8-bit to Unicode after the
8127 formatting is done. */
8128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129static int
8130formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008131 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 int flags,
8133 int prec,
8134 int type,
8135 PyObject *v)
8136{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008137 /* fmt = '%#.' + `prec` + `type`
8138 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 char fmt[20];
8140 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008141
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 x = PyFloat_AsDouble(v);
8143 if (x == -1.0 && PyErr_Occurred())
8144 return -1;
8145 if (prec < 0)
8146 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8148 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008149 /* Worst case length calc to ensure no buffer overrun:
8150
8151 'g' formats:
8152 fmt = %#.<prec>g
8153 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8154 for any double rep.)
8155 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8156
8157 'f' formats:
8158 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8159 len = 1 + 50 + 1 + prec = 52 + prec
8160
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008161 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008162 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008163
8164 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008165 if (((type == 'g' || type == 'G') &&
8166 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008167 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008168 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008169 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008170 return -1;
8171 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008172 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8173 (flags&F_ALT) ? "#" : "",
8174 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008175 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176}
8177
Tim Peters38fd5b62000-09-21 05:43:11 +00008178static PyObject*
8179formatlong(PyObject *val, int flags, int prec, int type)
8180{
8181 char *buf;
8182 int i, len;
8183 PyObject *str; /* temporary string object. */
8184 PyUnicodeObject *result;
8185
8186 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8187 if (!str)
8188 return NULL;
8189 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008190 if (!result) {
8191 Py_DECREF(str);
8192 return NULL;
8193 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008194 for (i = 0; i < len; i++)
8195 result->str[i] = buf[i];
8196 result->str[len] = 0;
8197 Py_DECREF(str);
8198 return (PyObject*)result;
8199}
8200
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201static int
8202formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008203 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 int flags,
8205 int prec,
8206 int type,
8207 PyObject *v)
8208{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008209 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008210 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8211 * + 1 + 1
8212 * = 24
8213 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008214 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008215 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 long x;
8217
8218 x = PyInt_AsLong(v);
8219 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008220 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008221 if (x < 0 && type == 'u') {
8222 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008223 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008224 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8225 sign = "-";
8226 else
8227 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008229 prec = 1;
8230
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008231 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8232 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008233 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008234 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008235 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008236 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008237 return -1;
8238 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008239
8240 if ((flags & F_ALT) &&
8241 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008242 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008243 * of issues that cause pain:
8244 * - when 0 is being converted, the C standard leaves off
8245 * the '0x' or '0X', which is inconsistent with other
8246 * %#x/%#X conversions and inconsistent with Python's
8247 * hex() function
8248 * - there are platforms that violate the standard and
8249 * convert 0 with the '0x' or '0X'
8250 * (Metrowerks, Compaq Tru64)
8251 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008252 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008253 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008254 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008255 * We can achieve the desired consistency by inserting our
8256 * own '0x' or '0X' prefix, and substituting %x/%X in place
8257 * of %#x/%#X.
8258 *
8259 * Note that this is the same approach as used in
8260 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008261 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8263 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008264 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008265 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008266 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8267 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008268 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008269 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008270 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008271 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008272 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008273 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274}
8275
8276static int
8277formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 size_t buflen,
8279 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008281 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008282 if (PyUnicode_Check(v)) {
8283 if (PyUnicode_GET_SIZE(v) != 1)
8284 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008288 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008289 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008290 goto onError;
8291 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293
8294 else {
8295 /* Integer input truncated to a character */
8296 long x;
8297 x = PyInt_AsLong(v);
8298 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008299 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008300#ifdef Py_UNICODE_WIDE
8301 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008302 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008303 "%c arg not in range(0x110000) "
8304 "(wide Python build)");
8305 return -1;
8306 }
8307#else
8308 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008309 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008310 "%c arg not in range(0x10000) "
8311 "(narrow Python build)");
8312 return -1;
8313 }
8314#endif
8315 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 }
8317 buf[1] = '\0';
8318 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008319
8320 onError:
8321 PyErr_SetString(PyExc_TypeError,
8322 "%c requires int or char");
8323 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324}
8325
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008326/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8327
8328 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8329 chars are formatted. XXX This is a magic number. Each formatting
8330 routine does bounds checking to ensure no overflow, but a better
8331 solution may be to malloc a buffer of appropriate size for each
8332 format. For now, the current solution is sufficient.
8333*/
/* Fully parenthesised so the cast cannot bind to neighbouring operators
   (e.g. `sizeof FORMATBUFLEN`) wherever the macro is expanded. */
#define FORMATBUFLEN ((size_t)120)
8335
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336PyObject *PyUnicode_Format(PyObject *format,
8337 PyObject *args)
8338{
8339 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008340 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 int args_owned = 0;
8342 PyUnicodeObject *result = NULL;
8343 PyObject *dict = NULL;
8344 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008345
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 if (format == NULL || args == NULL) {
8347 PyErr_BadInternalCall();
8348 return NULL;
8349 }
8350 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008351 if (uformat == NULL)
8352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 fmt = PyUnicode_AS_UNICODE(uformat);
8354 fmtcnt = PyUnicode_GET_SIZE(uformat);
8355
8356 reslen = rescnt = fmtcnt + 100;
8357 result = _PyUnicode_New(reslen);
8358 if (result == NULL)
8359 goto onError;
8360 res = PyUnicode_AS_UNICODE(result);
8361
8362 if (PyTuple_Check(args)) {
8363 arglen = PyTuple_Size(args);
8364 argidx = 0;
8365 }
8366 else {
8367 arglen = -1;
8368 argidx = -2;
8369 }
Christian Heimese93237d2007-12-19 02:37:44 +00008370 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008371 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 dict = args;
8373
8374 while (--fmtcnt >= 0) {
8375 if (*fmt != '%') {
8376 if (--rescnt < 0) {
8377 rescnt = fmtcnt + 100;
8378 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008379 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8382 --rescnt;
8383 }
8384 *res++ = *fmt++;
8385 }
8386 else {
8387 /* Got a format specifier */
8388 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008389 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 Py_UNICODE c = '\0';
8392 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008393 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 PyObject *v = NULL;
8395 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008396 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008399 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
8401 fmt++;
8402 if (*fmt == '(') {
8403 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 PyObject *key;
8406 int pcount = 1;
8407
8408 if (dict == NULL) {
8409 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008410 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 goto onError;
8412 }
8413 ++fmt;
8414 --fmtcnt;
8415 keystart = fmt;
8416 /* Skip over balanced parentheses */
8417 while (pcount > 0 && --fmtcnt >= 0) {
8418 if (*fmt == ')')
8419 --pcount;
8420 else if (*fmt == '(')
8421 ++pcount;
8422 fmt++;
8423 }
8424 keylen = fmt - keystart - 1;
8425 if (fmtcnt < 0 || pcount > 0) {
8426 PyErr_SetString(PyExc_ValueError,
8427 "incomplete format key");
8428 goto onError;
8429 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008430#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008431 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 then looked up since Python uses strings to hold
8433 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008434 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 key = PyUnicode_EncodeUTF8(keystart,
8436 keylen,
8437 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008438#else
8439 key = PyUnicode_FromUnicode(keystart, keylen);
8440#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 if (key == NULL)
8442 goto onError;
8443 if (args_owned) {
8444 Py_DECREF(args);
8445 args_owned = 0;
8446 }
8447 args = PyObject_GetItem(dict, key);
8448 Py_DECREF(key);
8449 if (args == NULL) {
8450 goto onError;
8451 }
8452 args_owned = 1;
8453 arglen = -1;
8454 argidx = -2;
8455 }
8456 while (--fmtcnt >= 0) {
8457 switch (c = *fmt++) {
8458 case '-': flags |= F_LJUST; continue;
8459 case '+': flags |= F_SIGN; continue;
8460 case ' ': flags |= F_BLANK; continue;
8461 case '#': flags |= F_ALT; continue;
8462 case '0': flags |= F_ZERO; continue;
8463 }
8464 break;
8465 }
8466 if (c == '*') {
8467 v = getnextarg(args, arglen, &argidx);
8468 if (v == NULL)
8469 goto onError;
8470 if (!PyInt_Check(v)) {
8471 PyErr_SetString(PyExc_TypeError,
8472 "* wants int");
8473 goto onError;
8474 }
8475 width = PyInt_AsLong(v);
8476 if (width < 0) {
8477 flags |= F_LJUST;
8478 width = -width;
8479 }
8480 if (--fmtcnt >= 0)
8481 c = *fmt++;
8482 }
8483 else if (c >= '0' && c <= '9') {
8484 width = c - '0';
8485 while (--fmtcnt >= 0) {
8486 c = *fmt++;
8487 if (c < '0' || c > '9')
8488 break;
8489 if ((width*10) / 10 != width) {
8490 PyErr_SetString(PyExc_ValueError,
8491 "width too big");
8492 goto onError;
8493 }
8494 width = width*10 + (c - '0');
8495 }
8496 }
8497 if (c == '.') {
8498 prec = 0;
8499 if (--fmtcnt >= 0)
8500 c = *fmt++;
8501 if (c == '*') {
8502 v = getnextarg(args, arglen, &argidx);
8503 if (v == NULL)
8504 goto onError;
8505 if (!PyInt_Check(v)) {
8506 PyErr_SetString(PyExc_TypeError,
8507 "* wants int");
8508 goto onError;
8509 }
8510 prec = PyInt_AsLong(v);
8511 if (prec < 0)
8512 prec = 0;
8513 if (--fmtcnt >= 0)
8514 c = *fmt++;
8515 }
8516 else if (c >= '0' && c <= '9') {
8517 prec = c - '0';
8518 while (--fmtcnt >= 0) {
8519 c = Py_CHARMASK(*fmt++);
8520 if (c < '0' || c > '9')
8521 break;
8522 if ((prec*10) / 10 != prec) {
8523 PyErr_SetString(PyExc_ValueError,
8524 "prec too big");
8525 goto onError;
8526 }
8527 prec = prec*10 + (c - '0');
8528 }
8529 }
8530 } /* prec */
8531 if (fmtcnt >= 0) {
8532 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 if (--fmtcnt >= 0)
8534 c = *fmt++;
8535 }
8536 }
8537 if (fmtcnt < 0) {
8538 PyErr_SetString(PyExc_ValueError,
8539 "incomplete format");
8540 goto onError;
8541 }
8542 if (c != '%') {
8543 v = getnextarg(args, arglen, &argidx);
8544 if (v == NULL)
8545 goto onError;
8546 }
8547 sign = 0;
8548 fill = ' ';
8549 switch (c) {
8550
8551 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008552 pbuf = formatbuf;
8553 /* presume that buffer length is at least 1 */
8554 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 len = 1;
8556 break;
8557
8558 case 's':
8559 case 'r':
8560 if (PyUnicode_Check(v) && c == 's') {
8561 temp = v;
8562 Py_INCREF(temp);
8563 }
8564 else {
8565 PyObject *unicode;
8566 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008567 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 else
8569 temp = PyObject_Repr(v);
8570 if (temp == NULL)
8571 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008572 if (PyUnicode_Check(temp))
8573 /* nothing to do */;
8574 else if (PyString_Check(temp)) {
8575 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008576 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008578 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008580 Py_DECREF(temp);
8581 temp = unicode;
8582 if (temp == NULL)
8583 goto onError;
8584 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008585 else {
8586 Py_DECREF(temp);
8587 PyErr_SetString(PyExc_TypeError,
8588 "%s argument has non-string str()");
8589 goto onError;
8590 }
8591 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008592 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 len = PyUnicode_GET_SIZE(temp);
8594 if (prec >= 0 && len > prec)
8595 len = prec;
8596 break;
8597
8598 case 'i':
8599 case 'd':
8600 case 'u':
8601 case 'o':
8602 case 'x':
8603 case 'X':
8604 if (c == 'i')
8605 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008606 isnumok = 0;
8607 if (PyNumber_Check(v)) {
8608 PyObject *iobj=NULL;
8609
8610 if (PyInt_Check(v) || (PyLong_Check(v))) {
8611 iobj = v;
8612 Py_INCREF(iobj);
8613 }
8614 else {
8615 iobj = PyNumber_Int(v);
8616 if (iobj==NULL) iobj = PyNumber_Long(v);
8617 }
8618 if (iobj!=NULL) {
8619 if (PyInt_Check(iobj)) {
8620 isnumok = 1;
8621 pbuf = formatbuf;
8622 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8623 flags, prec, c, iobj);
8624 Py_DECREF(iobj);
8625 if (len < 0)
8626 goto onError;
8627 sign = 1;
8628 }
8629 else if (PyLong_Check(iobj)) {
8630 isnumok = 1;
8631 temp = formatlong(iobj, flags, prec, c);
8632 Py_DECREF(iobj);
8633 if (!temp)
8634 goto onError;
8635 pbuf = PyUnicode_AS_UNICODE(temp);
8636 len = PyUnicode_GET_SIZE(temp);
8637 sign = 1;
8638 }
8639 else {
8640 Py_DECREF(iobj);
8641 }
8642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008644 if (!isnumok) {
8645 PyErr_Format(PyExc_TypeError,
8646 "%%%c format: a number is required, "
8647 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008648 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008649 }
8650 if (flags & F_ZERO)
8651 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 break;
8653
8654 case 'e':
8655 case 'E':
8656 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008657 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 case 'g':
8659 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008660 if (c == 'F')
8661 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008662 pbuf = formatbuf;
8663 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8664 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 if (len < 0)
8666 goto onError;
8667 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008668 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 fill = '0';
8670 break;
8671
8672 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008673 pbuf = formatbuf;
8674 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 if (len < 0)
8676 goto onError;
8677 break;
8678
8679 default:
8680 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008681 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008682 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008683 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008684 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008685 (Py_ssize_t)(fmt - 1 -
8686 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 goto onError;
8688 }
8689 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008690 if (*pbuf == '-' || *pbuf == '+') {
8691 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 len--;
8693 }
8694 else if (flags & F_SIGN)
8695 sign = '+';
8696 else if (flags & F_BLANK)
8697 sign = ' ';
8698 else
8699 sign = 0;
8700 }
8701 if (width < len)
8702 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008703 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 reslen -= rescnt;
8705 rescnt = width + fmtcnt + 100;
8706 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008707 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008708 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008709 PyErr_NoMemory();
8710 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008711 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008712 if (_PyUnicode_Resize(&result, reslen) < 0) {
8713 Py_XDECREF(temp);
8714 goto onError;
8715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 res = PyUnicode_AS_UNICODE(result)
8717 + reslen - rescnt;
8718 }
8719 if (sign) {
8720 if (fill != ' ')
8721 *res++ = sign;
8722 rescnt--;
8723 if (width > len)
8724 width--;
8725 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008726 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8727 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008728 assert(pbuf[1] == c);
8729 if (fill != ' ') {
8730 *res++ = *pbuf++;
8731 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008732 }
Tim Petersfff53252001-04-12 18:38:48 +00008733 rescnt -= 2;
8734 width -= 2;
8735 if (width < 0)
8736 width = 0;
8737 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 if (width > len && !(flags & F_LJUST)) {
8740 do {
8741 --rescnt;
8742 *res++ = fill;
8743 } while (--width > len);
8744 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008745 if (fill == ' ') {
8746 if (sign)
8747 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008748 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008749 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008750 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008751 *res++ = *pbuf++;
8752 *res++ = *pbuf++;
8753 }
8754 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008755 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 res += len;
8757 rescnt -= len;
8758 while (--width >= len) {
8759 --rescnt;
8760 *res++ = ' ';
8761 }
8762 if (dict && (argidx < arglen) && c != '%') {
8763 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008764 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008765 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 goto onError;
8767 }
8768 Py_XDECREF(temp);
8769 } /* '%' */
8770 } /* until end */
8771 if (argidx < arglen && !dict) {
8772 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008773 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 goto onError;
8775 }
8776
Thomas Woutersa96affe2006-03-12 00:29:36 +00008777 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8778 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 if (args_owned) {
8780 Py_DECREF(args);
8781 }
8782 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 return (PyObject *)result;
8784
8785 onError:
8786 Py_XDECREF(result);
8787 Py_DECREF(uformat);
8788 if (args_owned) {
8789 Py_DECREF(args);
8790 }
8791 return NULL;
8792}
8793
8794static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 (readbufferproc) unicode_buffer_getreadbuf,
8796 (writebufferproc) unicode_buffer_getwritebuf,
8797 (segcountproc) unicode_buffer_getsegcount,
8798 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799};
8800
Jeremy Hylton938ace62002-07-17 16:30:39 +00008801static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008802unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8803
Tim Peters6d6c1a32001-08-02 04:15:00 +00008804static PyObject *
8805unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8806{
8807 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008808 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008809 char *encoding = NULL;
8810 char *errors = NULL;
8811
Guido van Rossume023fe02001-08-30 03:12:59 +00008812 if (type != &PyUnicode_Type)
8813 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008814 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8815 kwlist, &x, &encoding, &errors))
8816 return NULL;
8817 if (x == NULL)
8818 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008819 if (encoding == NULL && errors == NULL)
8820 return PyObject_Unicode(x);
8821 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822 return PyUnicode_FromEncodedObject(x, encoding, errors);
8823}
8824
Guido van Rossume023fe02001-08-30 03:12:59 +00008825static PyObject *
8826unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8827{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008828 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008829 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008830
8831 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8832 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8833 if (tmp == NULL)
8834 return NULL;
8835 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008836 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008837 if (pnew == NULL) {
8838 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008839 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008840 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008841 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008842 if (pnew->str == NULL) {
8843 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008844 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008845 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008846 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008847 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008848 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8849 pnew->length = n;
8850 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008851 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008852 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008853}
8854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008855PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008856"unicode(string [, encoding[, errors]]) -> object\n\
8857\n\
8858Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008859encoding defaults to the current default string encoding.\n\
8860errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008861
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008863 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 "unicode", /* tp_name */
8865 sizeof(PyUnicodeObject), /* tp_size */
8866 0, /* tp_itemsize */
8867 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008868 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008872 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008873 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008874 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008876 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 (hashfunc) unicode_hash, /* tp_hash*/
8878 0, /* tp_call*/
8879 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008880 PyObject_GenericGetAttr, /* tp_getattro */
8881 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008883 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008884 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008885 unicode_doc, /* tp_doc */
8886 0, /* tp_traverse */
8887 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008888 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008889 0, /* tp_weaklistoffset */
8890 0, /* tp_iter */
8891 0, /* tp_iternext */
8892 unicode_methods, /* tp_methods */
8893 0, /* tp_members */
8894 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008895 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008896 0, /* tp_dict */
8897 0, /* tp_descr_get */
8898 0, /* tp_descr_set */
8899 0, /* tp_dictoffset */
8900 0, /* tp_init */
8901 0, /* tp_alloc */
8902 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008903 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904};
8905
8906/* Initialize the Unicode implementation */
8907
Thomas Wouters78890102000-07-22 19:25:51 +00008908void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008910 int i;
8911
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008912 /* XXX - move this array to unicodectype.c ? */
8913 Py_UNICODE linebreak[] = {
8914 0x000A, /* LINE FEED */
8915 0x000D, /* CARRIAGE RETURN */
8916 0x001C, /* FILE SEPARATOR */
8917 0x001D, /* GROUP SEPARATOR */
8918 0x001E, /* RECORD SEPARATOR */
8919 0x0085, /* NEXT LINE */
8920 0x2028, /* LINE SEPARATOR */
8921 0x2029, /* PARAGRAPH SEPARATOR */
8922 };
8923
Fred Drakee4315f52000-05-09 19:53:39 +00008924 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008925 free_list = NULL;
8926 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008928 if (!unicode_empty)
8929 return;
8930
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008931 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008932 for (i = 0; i < 256; i++)
8933 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008934 if (PyType_Ready(&PyUnicode_Type) < 0)
8935 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008936
8937 /* initialize the linebreak bloom filter */
8938 bloom_linebreak = make_bloom_mask(
8939 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8940 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008941
8942 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943}
8944
8945/* Finalize the Unicode implementation */
8946
Christian Heimes3b718a72008-02-14 12:47:33 +00008947int
8948PyUnicode_ClearFreeList(void)
8949{
8950 int freelist_size = numfree;
8951 PyUnicodeObject *u;
8952
8953 for (u = free_list; u != NULL;) {
8954 PyUnicodeObject *v = u;
8955 u = *(PyUnicodeObject **)u;
8956 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00008957 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00008958 Py_XDECREF(v->defenc);
8959 PyObject_Del(v);
8960 numfree--;
8961 }
8962 free_list = NULL;
8963 assert(numfree == 0);
8964 return freelist_size;
8965}
8966
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967void
Thomas Wouters78890102000-07-22 19:25:51 +00008968_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008970 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008972 Py_XDECREF(unicode_empty);
8973 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008974
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008975 for (i = 0; i < 256; i++) {
8976 if (unicode_latin1[i]) {
8977 Py_DECREF(unicode_latin1[i]);
8978 unicode_latin1[i] = NULL;
8979 }
8980 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008981 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008983
Anthony Baxterac6bd462006-04-13 02:06:09 +00008984#ifdef __cplusplus
8985}
8986#endif
8987
8988
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008989/*
8990Local variables:
8991c-basic-offset: 4
8992indent-tabs-mode: nil
8993End:
8994*/