blob: 43379728cbbae4354d7374ff7f6e700df0fd4bf2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep-alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit.  This
   saves malloc() round trips for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage lying around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
/* Byte-order selection: big endian only when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the ASCII range: entry i is 1 when code point i
   is treated as whitespace and 0 otherwise.  Only block comments are
   used inside the initializer so the table also builds with strict C89
   compilers. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: HORIZONTAL TABULATION
       0x000A: LINE FEED
       0x000B: VERTICAL TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Fast detection of line break characters.

   Lookup table covering the ASCII range: entry i is 1 when code point i
   is treated as a line break and 0 otherwise.  The table is only ever
   read, hence const.  Only block comments are used inside the
   initializer so it also builds with strict C89 compilers. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used, with the least
   significant 5 bits of each Unicode character selecting the bit. */

/* The linebreak mask is filled in during Unicode initialization. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by `ch` is set in `mask`.  The shift is
   done on an unsigned long so that bit 31 can be addressed without
   shifting into the sign bit of an int; `mask` is parenthesised so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: ASCII characters are looked up in ascii_linebreak,
   everything else is pre-filtered through the bloom mask before the
   full Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Membership test with bloom pre-filter: the exact unicode_member() scan
   only runs when the bloom mask says the character may be present.  The
   expansion is fully parenthesised so the macro behaves as a single
   operand (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000264 unicode->str = PyObject_REALLOC(unicode->str,
265 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000267 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_NoMemory();
269 return -1;
270 }
271 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000272 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 if (unicode->defenc) {
277 Py_DECREF(unicode->defenc);
278 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return 0;
283}
284
285/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000286 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287
288 XXX This allocator could further be enhanced by assuring that the
289 free list never reduces its size below 1.
290
291*/
292
293static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295{
296 register PyUnicodeObject *unicode;
297
Andrew Dalkee0df7622006-05-27 11:04:36 +0000298 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (length == 0 && unicode_empty != NULL) {
300 Py_INCREF(unicode_empty);
301 return unicode_empty;
302 }
303
304 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000305 if (free_list) {
306 unicode = free_list;
307 free_list = *(PyUnicodeObject **)unicode;
308 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 /* Keep-Alive optimization: we only upsize the buffer,
311 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000312 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000313 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000314 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000318 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000319 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
320 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 }
322 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000325 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000326 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (unicode == NULL)
328 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000329 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000333 if (!unicode->str) {
334 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000335 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000337 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000338 * the caller fails before initializing str -- unicode_resize()
339 * reads str[0], and the Keep-Alive optimization can keep memory
340 * allocated for str alive across a call to unicode_dealloc(unicode).
341 * We don't want unicode_resize to read uninitialized memory in
342 * that case.
343 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000344 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000346 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350
351 onError:
352 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000353 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355}
356
357static
Guido van Rossum9475a232001-10-05 20:51:39 +0000358void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000360 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000361 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000362 /* Keep-Alive optimization */
363 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000364 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str = NULL;
366 unicode->length = 0;
367 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000368 if (unicode->defenc) {
369 Py_DECREF(unicode->defenc);
370 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000371 }
372 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000373 *(PyUnicodeObject **)unicode = free_list;
374 free_list = unicode;
375 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 }
377 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000378 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000380 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382}
383
Martin v. Löwis18e16552006-02-15 17:27:45 +0000384int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000385{
386 register PyUnicodeObject *v;
387
388 /* Argument checks */
389 if (unicode == NULL) {
390 PyErr_BadInternalCall();
391 return -1;
392 }
393 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000394 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 PyErr_BadInternalCall();
396 return -1;
397 }
398
399 /* Resizing unicode_empty and single character objects is not
400 possible since these are being shared. We simply return a fresh
401 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000402 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 (v == unicode_empty || v->length == 1)) {
404 PyUnicodeObject *w = _PyUnicode_New(length);
405 if (w == NULL)
406 return -1;
407 Py_UNICODE_COPY(w->str, v->str,
408 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000409 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000410 *unicode = (PyObject *)w;
411 return 0;
412 }
413
414 /* Note that we don't have to modify *unicode for unshared Unicode
415 objects, since we can modify them in-place. */
416 return unicode_resize(v, length);
417}
418
/* Internal API for use in unicodeobject.c only !
   Casts the address of a Unicode object variable to PyObject ** and
   forwards to PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000424 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425{
426 PyUnicodeObject *unicode;
427
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 /* If the Unicode data is known at construction time, we can apply
429 some optimizations which share commonly used objects. */
430 if (u != NULL) {
431
432 /* Optimization for empty strings */
433 if (size == 0 && unicode_empty != NULL) {
434 Py_INCREF(unicode_empty);
435 return (PyObject *)unicode_empty;
436 }
437
438 /* Single character Unicode objects in the Latin-1 range are
439 shared when using this constructor */
440 if (size == 1 && *u < 256) {
441 unicode = unicode_latin1[*u];
442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 if (!unicode)
445 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000446 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 unicode_latin1[*u] = unicode;
448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
Tim Petersced69f82003-09-16 20:30:58 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 unicode = _PyUnicode_New(size);
455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461
462 return (PyObject *)unicode;
463}
464
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000465PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
466{
467 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000468
469 assert(size <= 0);
470 if (size < 0) {
471 PyErr_SetString(PyExc_SystemError,
472 "Negative size passed to PyUnicode_FromStringAndSize");
473 return NULL;
474 }
475
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000476 /* If the Unicode data is known at construction time, we can apply
477 some optimizations which share commonly used objects.
478 Also, this means the input must be UTF-8, so fall back to the
479 UTF-8 decoder at the end. */
480 if (u != NULL) {
481
482 /* Optimization for empty strings */
483 if (size == 0 && unicode_empty != NULL) {
484 Py_INCREF(unicode_empty);
485 return (PyObject *)unicode_empty;
486 }
487
488 /* Single characters are shared when using this constructor.
489 Restrict to ASCII, since the input must be UTF-8. */
490 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000491 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000492 if (!unicode) {
493 unicode = _PyUnicode_New(1);
494 if (!unicode)
495 return NULL;
496 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000497 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000498 }
499 Py_INCREF(unicode);
500 return (PyObject *)unicode;
501 }
502
503 return PyUnicode_DecodeUTF8(u, size, NULL);
504 }
505
506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 return (PyObject *)unicode;
511}
512
513PyObject *PyUnicode_FromString(const char *u)
514{
515 size_t size = strlen(u);
516 if (size > PY_SSIZE_T_MAX) {
517 PyErr_SetString(PyExc_OverflowError, "input too long");
518 return NULL;
519 }
520
521 return PyUnicode_FromStringAndSize(u, size);
522}
523
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524#ifdef HAVE_WCHAR_H
525
526PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000527 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528{
529 PyUnicodeObject *unicode;
530
531 if (w == NULL) {
532 PyErr_BadInternalCall();
533 return NULL;
534 }
535
536 unicode = _PyUnicode_New(size);
537 if (!unicode)
538 return NULL;
539
540 /* Copy the wchar_t data into the new object */
541#ifdef HAVE_USABLE_WCHAR_T
542 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000543#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544 {
545 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000546 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000548 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 *u++ = *w++;
550 }
551#endif
552
553 return (PyObject *)unicode;
554}
555
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000556static void
557makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
558{
559 *fmt++ = '%';
560 if (width) {
561 if (zeropad)
562 *fmt++ = '0';
563 fmt += sprintf(fmt, "%d", width);
564 }
565 if (precision)
566 fmt += sprintf(fmt, ".%d", precision);
567 if (longflag)
568 *fmt++ = 'l';
569 else if (size_tflag) {
570 char *f = PY_FORMAT_SIZE_T;
571 while (*f)
572 *fmt++ = *f++;
573 }
574 *fmt++ = c;
575 *fmt = '\0';
576}
577
/* Append the NUL-terminated string `string` element by element at the
   output cursor.  Relies on two variables of the enclosing function:
   `copy` (scratch read pointer) and `s` (output cursor, advanced past
   the appended characters). */
#define appendstring(string) \
    {copy = (string); while (*copy) *s++ = *copy++;}
579
580PyObject *
581PyUnicode_FromFormatV(const char *format, va_list vargs)
582{
583 va_list count;
584 Py_ssize_t callcount = 0;
585 PyObject **callresults = NULL;
586 PyObject **callresult = NULL;
587 Py_ssize_t n = 0;
588 int width = 0;
589 int precision = 0;
590 int zeropad;
591 const char* f;
592 Py_UNICODE *s;
593 PyObject *string;
594 /* used by sprintf */
595 char buffer[21];
596 /* use abuffer instead of buffer, if we need more space
597 * (which can happen if there's a format specifier with width). */
598 char *abuffer = NULL;
599 char *realbuffer;
600 Py_ssize_t abuffersize = 0;
601 char fmt[60]; /* should be enough for %0width.precisionld */
602 const char *copy;
603
604#ifdef VA_LIST_IS_ARRAY
605 Py_MEMCPY(count, vargs, sizeof(va_list));
606#else
607#ifdef __va_copy
608 __va_copy(count, vargs);
609#else
610 count = vargs;
611#endif
612#endif
613 /* step 1: count the number of %S/%R format specifications
614 * (we call PyObject_Str()/PyObject_Repr() for these objects
615 * once during step 3 and put the result in an array) */
616 for (f = format; *f; f++) {
617 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
618 ++callcount;
619 }
620 /* step 2: allocate memory for the results of
621 * PyObject_Str()/PyObject_Repr() calls */
622 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000623 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000624 if (!callresults) {
625 PyErr_NoMemory();
626 return NULL;
627 }
628 callresult = callresults;
629 }
630 /* step 3: figure out how large a buffer we need */
631 for (f = format; *f; f++) {
632 if (*f == '%') {
633 const char* p = f;
634 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000635 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000636 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000637 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000638 ;
639
640 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
641 * they don't affect the amount of space we reserve.
642 */
643 if ((*f == 'l' || *f == 'z') &&
644 (f[1] == 'd' || f[1] == 'u'))
645 ++f;
646
647 switch (*f) {
648 case 'c':
649 (void)va_arg(count, int);
650 /* fall through... */
651 case '%':
652 n++;
653 break;
654 case 'd': case 'u': case 'i': case 'x':
655 (void) va_arg(count, int);
656 /* 20 bytes is enough to hold a 64-bit
657 integer. Decimal takes the most space.
658 This isn't enough for octal.
659 If a width is specified we need more
660 (which we allocate later). */
661 if (width < 20)
662 width = 20;
663 n += width;
664 if (abuffersize < width)
665 abuffersize = width;
666 break;
667 case 's':
668 {
669 /* UTF-8 */
670 unsigned char*s;
671 s = va_arg(count, unsigned char*);
672 while (*s) {
673 if (*s < 128) {
674 n++; s++;
675 } else if (*s < 0xc0) {
676 /* invalid UTF-8 */
677 n++; s++;
678 } else if (*s < 0xc0) {
679 n++;
680 s++; if(!*s)break;
681 s++;
682 } else if (*s < 0xe0) {
683 n++;
684 s++; if(!*s)break;
685 s++; if(!*s)break;
686 s++;
687 } else {
688 #ifdef Py_UNICODE_WIDE
689 n++;
690 #else
691 n+=2;
692 #endif
693 s++; if(!*s)break;
694 s++; if(!*s)break;
695 s++; if(!*s)break;
696 s++;
697 }
698 }
699 break;
700 }
701 case 'U':
702 {
703 PyObject *obj = va_arg(count, PyObject *);
704 assert(obj && PyUnicode_Check(obj));
705 n += PyUnicode_GET_SIZE(obj);
706 break;
707 }
708 case 'V':
709 {
710 PyObject *obj = va_arg(count, PyObject *);
711 const char *str = va_arg(count, const char *);
712 assert(obj || str);
713 assert(!obj || PyUnicode_Check(obj));
714 if (obj)
715 n += PyUnicode_GET_SIZE(obj);
716 else
717 n += strlen(str);
718 break;
719 }
720 case 'S':
721 {
722 PyObject *obj = va_arg(count, PyObject *);
723 PyObject *str;
724 assert(obj);
725 str = PyObject_Str(obj);
726 if (!str)
727 goto fail;
728 n += PyUnicode_GET_SIZE(str);
729 /* Remember the str and switch to the next slot */
730 *callresult++ = str;
731 break;
732 }
733 case 'R':
734 {
735 PyObject *obj = va_arg(count, PyObject *);
736 PyObject *repr;
737 assert(obj);
738 repr = PyObject_Repr(obj);
739 if (!repr)
740 goto fail;
741 n += PyUnicode_GET_SIZE(repr);
742 /* Remember the repr and switch to the next slot */
743 *callresult++ = repr;
744 break;
745 }
746 case 'p':
747 (void) va_arg(count, int);
748 /* maximum 64-bit pointer representation:
749 * 0xffffffffffffffff
750 * so 19 characters is enough.
751 * XXX I count 18 -- what's the extra for?
752 */
753 n += 19;
754 break;
755 default:
756 /* if we stumble upon an unknown
757 formatting code, copy the rest of
758 the format string to the output
759 string. (we cannot just skip the
760 code, since there's no way to know
761 what's in the argument list) */
762 n += strlen(p);
763 goto expand;
764 }
765 } else
766 n++;
767 }
768 expand:
769 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000770 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000771 if (!abuffer) {
772 PyErr_NoMemory();
773 goto fail;
774 }
775 realbuffer = abuffer;
776 }
777 else
778 realbuffer = buffer;
779 /* step 4: fill the buffer */
780 /* Since we've analyzed how much space we need for the worst case,
781 we don't have to resize the string.
782 There can be no errors beyond this point. */
783 string = PyUnicode_FromUnicode(NULL, n);
784 if (!string)
785 goto fail;
786
787 s = PyUnicode_AS_UNICODE(string);
788 callresult = callresults;
789
790 for (f = format; *f; f++) {
791 if (*f == '%') {
792 const char* p = f++;
793 int longflag = 0;
794 int size_tflag = 0;
795 zeropad = (*f == '0');
796 /* parse the width.precision part */
797 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000798 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000799 width = (width*10) + *f++ - '0';
800 precision = 0;
801 if (*f == '.') {
802 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000803 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000804 precision = (precision*10) + *f++ - '0';
805 }
806 /* handle the long flag, but only for %ld and %lu.
807 others can be added when necessary. */
808 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
809 longflag = 1;
810 ++f;
811 }
812 /* handle the size_t flag. */
813 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
814 size_tflag = 1;
815 ++f;
816 }
817
818 switch (*f) {
819 case 'c':
820 *s++ = va_arg(vargs, int);
821 break;
822 case 'd':
823 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
824 if (longflag)
825 sprintf(realbuffer, fmt, va_arg(vargs, long));
826 else if (size_tflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
828 else
829 sprintf(realbuffer, fmt, va_arg(vargs, int));
830 appendstring(realbuffer);
831 break;
832 case 'u':
833 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
834 if (longflag)
835 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
836 else if (size_tflag)
837 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
838 else
839 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
840 appendstring(realbuffer);
841 break;
842 case 'i':
843 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
844 sprintf(realbuffer, fmt, va_arg(vargs, int));
845 appendstring(realbuffer);
846 break;
847 case 'x':
848 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
849 sprintf(realbuffer, fmt, va_arg(vargs, int));
850 appendstring(realbuffer);
851 break;
852 case 's':
853 {
854 /* Parameter must be UTF-8 encoded.
855 In case of encoding errors, use
856 the replacement character. */
857 PyObject *u;
858 p = va_arg(vargs, char*);
859 u = PyUnicode_DecodeUTF8(p, strlen(p),
860 "replace");
861 if (!u)
862 goto fail;
863 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
864 PyUnicode_GET_SIZE(u));
865 s += PyUnicode_GET_SIZE(u);
866 Py_DECREF(u);
867 break;
868 }
869 case 'U':
870 {
871 PyObject *obj = va_arg(vargs, PyObject *);
872 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
873 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
874 s += size;
875 break;
876 }
877 case 'V':
878 {
879 PyObject *obj = va_arg(vargs, PyObject *);
880 const char *str = va_arg(vargs, const char *);
881 if (obj) {
882 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
883 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
884 s += size;
885 } else {
886 appendstring(str);
887 }
888 break;
889 }
890 case 'S':
891 case 'R':
892 {
893 Py_UNICODE *ucopy;
894 Py_ssize_t usize;
895 Py_ssize_t upos;
896 /* unused, since we already have the result */
897 (void) va_arg(vargs, PyObject *);
898 ucopy = PyUnicode_AS_UNICODE(*callresult);
899 usize = PyUnicode_GET_SIZE(*callresult);
900 for (upos = 0; upos<usize;)
901 *s++ = ucopy[upos++];
902 /* We're done with the unicode()/repr() => forget it */
903 Py_DECREF(*callresult);
904 /* switch to next unicode()/repr() result */
905 ++callresult;
906 break;
907 }
908 case 'p':
909 sprintf(buffer, "%p", va_arg(vargs, void*));
910 /* %p is ill-defined: ensure leading 0x. */
911 if (buffer[1] == 'X')
912 buffer[1] = 'x';
913 else if (buffer[1] != 'x') {
914 memmove(buffer+2, buffer, strlen(buffer)+1);
915 buffer[0] = '0';
916 buffer[1] = 'x';
917 }
918 appendstring(buffer);
919 break;
920 case '%':
921 *s++ = '%';
922 break;
923 default:
924 appendstring(p);
925 goto end;
926 }
927 } else
928 *s++ = *f;
929 }
930
931 end:
932 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000933 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000934 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000935 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000936 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
937 return string;
938 fail:
939 if (callresults) {
940 PyObject **callresult2 = callresults;
941 while (callresult2 < callresult) {
942 Py_DECREF(*callresult2);
943 ++callresult2;
944 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000945 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946 }
947 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000948 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000949 return NULL;
950}
951
952#undef appendstring
953
954PyObject *
955PyUnicode_FromFormat(const char *format, ...)
956{
957 PyObject* ret;
958 va_list vargs;
959
960#ifdef HAVE_STDARG_PROTOTYPES
961 va_start(vargs, format);
962#else
963 va_start(vargs);
964#endif
965 ret = PyUnicode_FromFormatV(format, vargs);
966 va_end(vargs);
967 return ret;
968}
969
Martin v. Löwis18e16552006-02-15 17:27:45 +0000970Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
971 wchar_t *w,
972 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000973{
974 if (unicode == NULL) {
975 PyErr_BadInternalCall();
976 return -1;
977 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000978
979 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000981 size = PyUnicode_GET_SIZE(unicode) + 1;
982
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983#ifdef HAVE_USABLE_WCHAR_T
984 memcpy(w, unicode->str, size * sizeof(wchar_t));
985#else
986 {
987 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000988 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000990 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 *w++ = *u++;
992 }
993#endif
994
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000995 if (size > PyUnicode_GET_SIZE(unicode))
996 return PyUnicode_GET_SIZE(unicode);
997 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000998 return size;
999}
1000
1001#endif
1002
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001003PyObject *PyUnicode_FromOrdinal(int ordinal)
1004{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001005 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001006
1007#ifdef Py_UNICODE_WIDE
1008 if (ordinal < 0 || ordinal > 0x10ffff) {
1009 PyErr_SetString(PyExc_ValueError,
1010 "unichr() arg not in range(0x110000) "
1011 "(wide Python build)");
1012 return NULL;
1013 }
1014#else
1015 if (ordinal < 0 || ordinal > 0xffff) {
1016 PyErr_SetString(PyExc_ValueError,
1017 "unichr() arg not in range(0x10000) "
1018 "(narrow Python build)");
1019 return NULL;
1020 }
1021#endif
1022
Hye-Shik Chang40574832004-04-06 07:24:51 +00001023 s[0] = (Py_UNICODE)ordinal;
1024 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001025}
1026
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027PyObject *PyUnicode_FromObject(register PyObject *obj)
1028{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001029 /* XXX Perhaps we should make this API an alias of
1030 PyObject_Unicode() instead ?! */
1031 if (PyUnicode_CheckExact(obj)) {
1032 Py_INCREF(obj);
1033 return obj;
1034 }
1035 if (PyUnicode_Check(obj)) {
1036 /* For a Unicode subtype that's not a Unicode object,
1037 return a true Unicode object with the same data. */
1038 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1039 PyUnicode_GET_SIZE(obj));
1040 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001041 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1042}
1043
1044PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1045 const char *encoding,
1046 const char *errors)
1047{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001048 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001049 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001051
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 if (obj == NULL) {
1053 PyErr_BadInternalCall();
1054 return NULL;
1055 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001056
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001057#if 0
1058 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001059 that no encodings is given and then redirect to
1060 PyObject_Unicode() which then applies the additional logic for
1061 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001062
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 NOTE: This API should really only be used for object which
1064 represent *encoded* Unicode !
1065
1066 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001067 if (PyUnicode_Check(obj)) {
1068 if (encoding) {
1069 PyErr_SetString(PyExc_TypeError,
1070 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001071 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001072 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001073 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001074 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001075#else
1076 if (PyUnicode_Check(obj)) {
1077 PyErr_SetString(PyExc_TypeError,
1078 "decoding Unicode is not supported");
1079 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001080 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081#endif
1082
1083 /* Coerce object */
1084 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001085 s = PyString_AS_STRING(obj);
1086 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001087 }
1088 else if (PyBytes_Check(obj)) {
1089 /* Python 2.x specific */
1090 PyErr_Format(PyExc_TypeError,
1091 "decoding bytearray is not supported");
1092 return NULL;
1093 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001094 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1095 /* Overwrite the error message with something more useful in
1096 case of a TypeError. */
1097 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001098 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 "coercing to Unicode: need string or buffer, "
1100 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001101 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001102 goto onError;
1103 }
Tim Petersced69f82003-09-16 20:30:58 +00001104
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 if (len == 0) {
1107 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001112
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001113 return v;
1114
1115 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117}
1118
1119PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001120 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 const char *encoding,
1122 const char *errors)
1123{
1124 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001125
1126 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001127 encoding = PyUnicode_GetDefaultEncoding();
1128
1129 /* Shortcuts for common default encodings */
1130 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001132 else if (strcmp(encoding, "latin-1") == 0)
1133 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001134#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1135 else if (strcmp(encoding, "mbcs") == 0)
1136 return PyUnicode_DecodeMBCS(s, size, errors);
1137#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001138 else if (strcmp(encoding, "ascii") == 0)
1139 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
1141 /* Decode via the codec registry */
1142 buffer = PyBuffer_FromMemory((void *)s, size);
1143 if (buffer == NULL)
1144 goto onError;
1145 unicode = PyCodec_Decode(buffer, encoding, errors);
1146 if (unicode == NULL)
1147 goto onError;
1148 if (!PyUnicode_Check(unicode)) {
1149 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001150 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001151 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 Py_DECREF(unicode);
1153 goto onError;
1154 }
1155 Py_DECREF(buffer);
1156 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001157
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 onError:
1159 Py_XDECREF(buffer);
1160 return NULL;
1161}
1162
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001163PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1164 const char *encoding,
1165 const char *errors)
1166{
1167 PyObject *v;
1168
1169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 goto onError;
1172 }
1173
1174 if (encoding == NULL)
1175 encoding = PyUnicode_GetDefaultEncoding();
1176
1177 /* Decode via the codec registry */
1178 v = PyCodec_Decode(unicode, encoding, errors);
1179 if (v == NULL)
1180 goto onError;
1181 return v;
1182
1183 onError:
1184 return NULL;
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001188 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 const char *encoding,
1190 const char *errors)
1191{
1192 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001193
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 unicode = PyUnicode_FromUnicode(s, size);
1195 if (unicode == NULL)
1196 return NULL;
1197 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1198 Py_DECREF(unicode);
1199 return v;
1200}
1201
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001202PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1203 const char *encoding,
1204 const char *errors)
1205{
1206 PyObject *v;
1207
1208 if (!PyUnicode_Check(unicode)) {
1209 PyErr_BadArgument();
1210 goto onError;
1211 }
1212
1213 if (encoding == NULL)
1214 encoding = PyUnicode_GetDefaultEncoding();
1215
1216 /* Encode via the codec registry */
1217 v = PyCodec_Encode(unicode, encoding, errors);
1218 if (v == NULL)
1219 goto onError;
1220 return v;
1221
1222 onError:
1223 return NULL;
1224}
1225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1227 const char *encoding,
1228 const char *errors)
1229{
1230 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001231
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 if (!PyUnicode_Check(unicode)) {
1233 PyErr_BadArgument();
1234 goto onError;
1235 }
Fred Drakee4315f52000-05-09 19:53:39 +00001236
Tim Petersced69f82003-09-16 20:30:58 +00001237 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001238 encoding = PyUnicode_GetDefaultEncoding();
1239
1240 /* Shortcuts for common default encodings */
1241 if (errors == NULL) {
1242 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001243 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001244 else if (strcmp(encoding, "latin-1") == 0)
1245 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001246#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1247 else if (strcmp(encoding, "mbcs") == 0)
1248 return PyUnicode_AsMBCSString(unicode);
1249#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001250 else if (strcmp(encoding, "ascii") == 0)
1251 return PyUnicode_AsASCIIString(unicode);
1252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253
1254 /* Encode via the codec registry */
1255 v = PyCodec_Encode(unicode, encoding, errors);
1256 if (v == NULL)
1257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 if (!PyString_Check(v)) {
1259 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001260 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001261 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 Py_DECREF(v);
1263 goto onError;
1264 }
1265 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001266
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 onError:
1268 return NULL;
1269}
1270
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001271PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1272 const char *errors)
1273{
1274 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1275
1276 if (v)
1277 return v;
1278 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1279 if (v && errors == NULL)
1280 ((PyUnicodeObject *)unicode)->defenc = v;
1281 return v;
1282}
1283
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1285{
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290 return PyUnicode_AS_UNICODE(unicode);
1291
1292 onError:
1293 return NULL;
1294}
1295
Martin v. Löwis18e16552006-02-15 17:27:45 +00001296Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297{
1298 if (!PyUnicode_Check(unicode)) {
1299 PyErr_BadArgument();
1300 goto onError;
1301 }
1302 return PyUnicode_GET_SIZE(unicode);
1303
1304 onError:
1305 return -1;
1306}
1307
Thomas Wouters78890102000-07-22 19:25:51 +00001308const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001309{
1310 return unicode_default_encoding;
1311}
1312
1313int PyUnicode_SetDefaultEncoding(const char *encoding)
1314{
1315 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001316
Fred Drakee4315f52000-05-09 19:53:39 +00001317 /* Make sure the encoding is valid. As side effect, this also
1318 loads the encoding into the codec registry cache. */
1319 v = _PyCodec_Lookup(encoding);
1320 if (v == NULL)
1321 goto onError;
1322 Py_DECREF(v);
1323 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001324 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001325 sizeof(unicode_default_encoding));
1326 return 0;
1327
1328 onError:
1329 return -1;
1330}
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332/* error handling callback helper:
1333 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001334 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335 and adjust various state variables.
1336 return 0 on success, -1 on error
1337*/
1338
1339static
1340int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1341 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001342 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1343 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001346 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347
1348 PyObject *restuple = NULL;
1349 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001350 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1351 Py_ssize_t requiredsize;
1352 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001354 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001355 int res = -1;
1356
1357 if (*errorHandler == NULL) {
1358 *errorHandler = PyCodec_LookupError(errors);
1359 if (*errorHandler == NULL)
1360 goto onError;
1361 }
1362
1363 if (*exceptionObject == NULL) {
1364 *exceptionObject = PyUnicodeDecodeError_Create(
1365 encoding, input, insize, *startinpos, *endinpos, reason);
1366 if (*exceptionObject == NULL)
1367 goto onError;
1368 }
1369 else {
1370 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1371 goto onError;
1372 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1373 goto onError;
1374 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1375 goto onError;
1376 }
1377
1378 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1379 if (restuple == NULL)
1380 goto onError;
1381 if (!PyTuple_Check(restuple)) {
1382 PyErr_Format(PyExc_TypeError, &argparse[4]);
1383 goto onError;
1384 }
1385 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1386 goto onError;
1387 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001388 newpos = insize+newpos;
1389 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001390 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001391 goto onError;
1392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393
1394 /* need more space? (at least enough for what we
1395 have+the replacement+the rest of the string (starting
1396 at the new input position), so we won't have to check space
1397 when there are no errors in the rest of the string) */
1398 repptr = PyUnicode_AS_UNICODE(repunicode);
1399 repsize = PyUnicode_GET_SIZE(repunicode);
1400 requiredsize = *outpos + repsize + insize-newpos;
1401 if (requiredsize > outsize) {
1402 if (requiredsize<2*outsize)
1403 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001404 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001405 goto onError;
1406 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1407 }
1408 *endinpos = newpos;
1409 *inptr = input + newpos;
1410 Py_UNICODE_COPY(*outptr, repptr, repsize);
1411 *outptr += repsize;
1412 *outpos += repsize;
1413 /* we made it! */
1414 res = 0;
1415
1416 onError:
1417 Py_XDECREF(restuple);
1418 return res;
1419}
1420
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001421/* --- UTF-7 Codec -------------------------------------------------------- */
1422
1423/* see RFC2152 for details */
1424
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1443
/* SPECIAL(c, encodeO, encodeWS): true if c cannot be written directly in
   UTF-7.  Everything outside 1..127 is special; within that range the
   utf7_special table decides, with whitespace (class 2) and RFC2152 Set O
   (class 3) counted as special only when the respective flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.  Written as plain
   range checks rather than isalnum(), whose result depends on the locale
   and which is undefined for values that do not fit in an unsigned char
   (c may be a Py_UNICODE).  Like UB64 below, this assumes ASCII. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 of the `bits` pending bits in
   `ch` remain, emit the topmost 6 as a base64 digit through `out` and
   reduce `bits` accordingly.  `out` and `bits` must be modifiable lvalues. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64((ch) >> ((bits)-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 of the `bits`
   pending bits in `ch` remain, take the topmost 16 as one code unit and
   store it through `out`.  A unit in the range 0xDC00..0xDFFF sets
   `errmsg` and jumps to the `utf7Error` label, both of which the
   expanding function must provide. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001488 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 const char *errors)
1490{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001491 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1492}
1493
1494PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1495 Py_ssize_t size,
1496 const char *errors,
1497 Py_ssize_t *consumed)
1498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001500 Py_ssize_t startinpos;
1501 Py_ssize_t endinpos;
1502 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503 const char *e;
1504 PyUnicodeObject *unicode;
1505 Py_UNICODE *p;
1506 const char *errmsg = "";
1507 int inShift = 0;
1508 unsigned int bitsleft = 0;
1509 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 int surrogate = 0;
1511 PyObject *errorHandler = NULL;
1512 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001513
1514 unicode = _PyUnicode_New(size);
1515 if (!unicode)
1516 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001517 if (size == 0) {
1518 if (consumed)
1519 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001520 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001521 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522
1523 p = unicode->str;
1524 e = s + size;
1525
1526 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 Py_UNICODE ch;
1528 restart:
1529 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530
1531 if (inShift) {
1532 if ((ch == '-') || !B64CHAR(ch)) {
1533 inShift = 0;
1534 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001535
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1537 if (bitsleft >= 6) {
1538 /* The shift sequence has a partial character in it. If
1539 bitsleft < 6 then we could just classify it as padding
1540 but that is not the case here */
1541
1542 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001543 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 }
1545 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001546 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 here so indicate the potential of a misencoded character. */
1548
1549 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1550 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1551 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001552 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 }
1554
1555 if (ch == '-') {
1556 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001557 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 inShift = 1;
1559 }
1560 } else if (SPECIAL(ch,0,0)) {
1561 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001562 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 } else {
1564 *p++ = ch;
1565 }
1566 } else {
1567 charsleft = (charsleft << 6) | UB64(ch);
1568 bitsleft += 6;
1569 s++;
1570 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1571 }
1572 }
1573 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001574 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 s++;
1576 if (s < e && *s == '-') {
1577 s++;
1578 *p++ = '+';
1579 } else
1580 {
1581 inShift = 1;
1582 bitsleft = 0;
1583 }
1584 }
1585 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001586 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 errmsg = "unexpected special character";
1588 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001589 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 }
1591 else {
1592 *p++ = ch;
1593 s++;
1594 }
1595 continue;
1596 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 outpos = p-PyUnicode_AS_UNICODE(unicode);
1598 endinpos = s-starts;
1599 if (unicode_decode_call_errorhandler(
1600 errors, &errorHandler,
1601 "utf7", errmsg,
1602 starts, size, &startinpos, &endinpos, &exc, &s,
1603 (PyObject **)&unicode, &outpos, &p))
1604 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001605 }
1606
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001607 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608 outpos = p-PyUnicode_AS_UNICODE(unicode);
1609 endinpos = size;
1610 if (unicode_decode_call_errorhandler(
1611 errors, &errorHandler,
1612 "utf7", "unterminated shift sequence",
1613 starts, size, &startinpos, &endinpos, &exc, &s,
1614 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001616 if (s < e)
1617 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001619 if (consumed) {
1620 if(inShift)
1621 *consumed = startinpos;
1622 else
1623 *consumed = s-starts;
1624 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001626 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 goto onError;
1628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 Py_XDECREF(errorHandler);
1630 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631 return (PyObject *)unicode;
1632
1633onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 Py_XDECREF(errorHandler);
1635 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 Py_DECREF(unicode);
1637 return NULL;
1638}
1639
1640
1641PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001642 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 int encodeSetO,
1644 int encodeWhiteSpace,
1645 const char *errors)
1646{
1647 PyObject *v;
1648 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001649 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001651 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 unsigned int bitsleft = 0;
1653 unsigned long charsleft = 0;
1654 char * out;
1655 char * start;
1656
1657 if (size == 0)
1658 return PyString_FromStringAndSize(NULL, 0);
1659
1660 v = PyString_FromStringAndSize(NULL, cbAllocated);
1661 if (v == NULL)
1662 return NULL;
1663
1664 start = out = PyString_AS_STRING(v);
1665 for (;i < size; ++i) {
1666 Py_UNICODE ch = s[i];
1667
1668 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001669 if (ch == '+') {
1670 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 *out++ = '-';
1672 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1673 charsleft = ch;
1674 bitsleft = 16;
1675 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001676 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001678 } else {
1679 *out++ = (char) ch;
1680 }
1681 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1683 *out++ = B64(charsleft << (6-bitsleft));
1684 charsleft = 0;
1685 bitsleft = 0;
1686 /* Characters not in the BASE64 set implicitly unshift the sequence
1687 so no '-' is required, except if the character is itself a '-' */
1688 if (B64CHAR(ch) || ch == '-') {
1689 *out++ = '-';
1690 }
1691 inShift = 0;
1692 *out++ = (char) ch;
1693 } else {
1694 bitsleft += 16;
1695 charsleft = (charsleft << 16) | ch;
1696 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1697
1698 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001699 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001700 or '-' then the shift sequence will be terminated implicitly and we
1701 don't have to insert a '-'. */
1702
1703 if (bitsleft == 0) {
1704 if (i + 1 < size) {
1705 Py_UNICODE ch2 = s[i+1];
1706
1707 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001708
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 } else if (B64CHAR(ch2) || ch2 == '-') {
1710 *out++ = '-';
1711 inShift = 0;
1712 } else {
1713 inShift = 0;
1714 }
1715
1716 }
1717 else {
1718 *out++ = '-';
1719 inShift = 0;
1720 }
1721 }
Tim Petersced69f82003-09-16 20:30:58 +00001722 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001724 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001725 if (bitsleft) {
1726 *out++= B64(charsleft << (6-bitsleft) );
1727 *out++ = '-';
1728 }
1729
Tim Peters5de98422002-04-27 18:44:32 +00001730 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 return v;
1732}
1733
1734#undef SPECIAL
1735#undef B64
1736#undef B64CHAR
1737#undef UB64
1738#undef ENCODE
1739#undef DECODE
1740
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741/* --- UTF-8 Codec -------------------------------------------------------- */
1742
/* Map a UTF-8 prefix (first) byte to the length in bytes of the sequence it
   introduces.  Zero means the byte is an illegal prefix.  See RFC 2279 for
   details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1764
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001766 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 const char *errors)
1768{
Walter Dörwald69652032004-09-07 20:24:22 +00001769 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1770}
1771
1772PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001773 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001774 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001775 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001777 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001779 Py_ssize_t startinpos;
1780 Py_ssize_t endinpos;
1781 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *e;
1783 PyUnicodeObject *unicode;
1784 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 PyObject *errorHandler = NULL;
1787 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
1789 /* Note: size will always be longer than the resulting Unicode
1790 character count */
1791 unicode = _PyUnicode_New(size);
1792 if (!unicode)
1793 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001794 if (size == 0) {
1795 if (consumed)
1796 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
1800 /* Unpack UTF-8 encoded data */
1801 p = unicode->str;
1802 e = s + size;
1803
1804 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001805 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001808 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 s++;
1810 continue;
1811 }
1812
1813 n = utf8_code_length[ch];
1814
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001816 if (consumed)
1817 break;
1818 else {
1819 errmsg = "unexpected end of data";
1820 startinpos = s-starts;
1821 endinpos = size;
1822 goto utf8Error;
1823 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825
1826 switch (n) {
1827
1828 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 startinpos = s-starts;
1831 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001832 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833
1834 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001835 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 startinpos = s-starts;
1837 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839
1840 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 if ((s[1] & 0xc0) != 0x80) {
1842 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 startinpos = s-starts;
1844 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 goto utf8Error;
1846 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 startinpos = s-starts;
1850 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001851 errmsg = "illegal encoding";
1852 goto utf8Error;
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 break;
1857
1858 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001859 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 (s[2] & 0xc0) != 0x80) {
1861 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 startinpos = s-starts;
1863 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 goto utf8Error;
1865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001867 if (ch < 0x0800) {
1868 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001869 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001870
1871 XXX For wide builds (UCS-4) we should probably try
1872 to recombine the surrogates into a single code
1873 unit.
1874 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001875 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 startinpos = s-starts;
1877 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001878 goto utf8Error;
1879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001881 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001882 break;
1883
1884 case 4:
1885 if ((s[1] & 0xc0) != 0x80 ||
1886 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001887 (s[3] & 0xc0) != 0x80) {
1888 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001889 startinpos = s-starts;
1890 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001891 goto utf8Error;
1892 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1894 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1895 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001896 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001897 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001898 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001900 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 startinpos = s-starts;
1903 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001904 goto utf8Error;
1905 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = (Py_UNICODE)ch;
1908#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001909 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001910
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 /* translate from 10000..10FFFF to 0..FFFF */
1912 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001913
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001914 /* high surrogate = top 10 bits added to D800 */
1915 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001917 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001918 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001919#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 break;
1921
1922 default:
1923 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 startinpos = s-starts;
1926 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 }
1929 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001931
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001932 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 outpos = p-PyUnicode_AS_UNICODE(unicode);
1934 if (unicode_decode_call_errorhandler(
1935 errors, &errorHandler,
1936 "utf8", errmsg,
1937 starts, size, &startinpos, &endinpos, &exc, &s,
1938 (PyObject **)&unicode, &outpos, &p))
1939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 }
Walter Dörwald69652032004-09-07 20:24:22 +00001941 if (consumed)
1942 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943
1944 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001945 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 goto onError;
1947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 Py_XDECREF(errorHandler);
1949 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950 return (PyObject *)unicode;
1951
1952onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 Py_XDECREF(errorHandler);
1954 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 Py_DECREF(unicode);
1956 return NULL;
1957}
1958
Tim Peters602f7402002-04-27 18:03:26 +00001959/* Allocation strategy: if the string is short, convert into a stack buffer
1960 and allocate exactly as much space needed at the end. Else allocate the
1961 maximum possible needed (4 result bytes per Unicode character), and return
1962 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001963*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001964PyObject *
1965PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001966 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001967 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968{
Tim Peters602f7402002-04-27 18:03:26 +00001969#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001970
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001972 PyObject *v; /* result string object */
1973 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001974 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001975 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001976 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001977
Tim Peters602f7402002-04-27 18:03:26 +00001978 assert(s != NULL);
1979 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980
Tim Peters602f7402002-04-27 18:03:26 +00001981 if (size <= MAX_SHORT_UNICHARS) {
1982 /* Write into the stack buffer; nallocated can't overflow.
1983 * At the end, we'll allocate exactly as much heap space as it
1984 * turns out we need.
1985 */
1986 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1987 v = NULL; /* will allocate after we're done */
1988 p = stackbuf;
1989 }
1990 else {
1991 /* Overallocate on the heap, and give the excess back at the end. */
1992 nallocated = size * 4;
1993 if (nallocated / 4 != size) /* overflow! */
1994 return PyErr_NoMemory();
1995 v = PyString_FromStringAndSize(NULL, nallocated);
1996 if (v == NULL)
1997 return NULL;
1998 p = PyString_AS_STRING(v);
1999 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002000
Tim Peters602f7402002-04-27 18:03:26 +00002001 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002003
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002004 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002005 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002007
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002009 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002010 *p++ = (char)(0xc0 | (ch >> 6));
2011 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002012 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002013 else {
Tim Peters602f7402002-04-27 18:03:26 +00002014 /* Encode UCS2 Unicode ordinals */
2015 if (ch < 0x10000) {
2016 /* Special case: check for high surrogate */
2017 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2018 Py_UCS4 ch2 = s[i];
2019 /* Check for low surrogate and combine the two to
2020 form a UCS4 value */
2021 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002022 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002023 i++;
2024 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002025 }
Tim Peters602f7402002-04-27 18:03:26 +00002026 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002028 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002029 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2030 *p++ = (char)(0x80 | (ch & 0x3f));
2031 continue;
2032 }
2033encodeUCS4:
2034 /* Encode UCS4 Unicode ordinals */
2035 *p++ = (char)(0xf0 | (ch >> 18));
2036 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2037 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2038 *p++ = (char)(0x80 | (ch & 0x3f));
2039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002041
Tim Peters602f7402002-04-27 18:03:26 +00002042 if (v == NULL) {
2043 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002044 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002045 assert(nneeded <= nallocated);
2046 v = PyString_FromStringAndSize(stackbuf, nneeded);
2047 }
2048 else {
2049 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002050 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002051 assert(nneeded <= nallocated);
2052 _PyString_Resize(&v, nneeded);
2053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002055
Tim Peters602f7402002-04-27 18:03:26 +00002056#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057}
2058
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 if (!PyUnicode_Check(unicode)) {
2062 PyErr_BadArgument();
2063 return NULL;
2064 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002065 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2066 PyUnicode_GET_SIZE(unicode),
2067 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068}
2069
Walter Dörwald6e390802007-08-17 16:41:28 +00002070/* --- UTF-32 Codec ------------------------------------------------------- */
2071
2072PyObject *
2073PyUnicode_DecodeUTF32(const char *s,
2074 Py_ssize_t size,
2075 const char *errors,
2076 int *byteorder)
2077{
2078 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2079}
2080
2081PyObject *
2082PyUnicode_DecodeUTF32Stateful(const char *s,
2083 Py_ssize_t size,
2084 const char *errors,
2085 int *byteorder,
2086 Py_ssize_t *consumed)
2087{
2088 const char *starts = s;
2089 Py_ssize_t startinpos;
2090 Py_ssize_t endinpos;
2091 Py_ssize_t outpos;
2092 PyUnicodeObject *unicode;
2093 Py_UNICODE *p;
2094#ifndef Py_UNICODE_WIDE
2095 int i, pairs;
2096#else
2097 const int pairs = 0;
2098#endif
2099 const unsigned char *q, *e;
2100 int bo = 0; /* assume native ordering by default */
2101 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002102 /* Offsets from q for retrieving bytes in the right order. */
2103#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2104 int iorder[] = {0, 1, 2, 3};
2105#else
2106 int iorder[] = {3, 2, 1, 0};
2107#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002108 PyObject *errorHandler = NULL;
2109 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002110 /* On narrow builds we split characters outside the BMP into two
2111 codepoints => count how much extra space we need. */
2112#ifndef Py_UNICODE_WIDE
2113 for (i = pairs = 0; i < size/4; i++)
2114 if (((Py_UCS4 *)s)[i] >= 0x10000)
2115 pairs++;
2116#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002117
2118 /* This might be one to much, because of a BOM */
2119 unicode = _PyUnicode_New((size+3)/4+pairs);
2120 if (!unicode)
2121 return NULL;
2122 if (size == 0)
2123 return (PyObject *)unicode;
2124
2125 /* Unpack UTF-32 encoded data */
2126 p = unicode->str;
2127 q = (unsigned char *)s;
2128 e = q + size;
2129
2130 if (byteorder)
2131 bo = *byteorder;
2132
2133 /* Check for BOM marks (U+FEFF) in the input and adjust current
2134 byte order setting accordingly. In native mode, the leading BOM
2135 mark is skipped, in all other modes, it is copied to the output
2136 stream as-is (giving a ZWNBSP character). */
2137 if (bo == 0) {
2138 if (size >= 4) {
2139 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2140 (q[iorder[1]] << 8) | q[iorder[0]];
2141#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2142 if (bom == 0x0000FEFF) {
2143 q += 4;
2144 bo = -1;
2145 }
2146 else if (bom == 0xFFFE0000) {
2147 q += 4;
2148 bo = 1;
2149 }
2150#else
2151 if (bom == 0x0000FEFF) {
2152 q += 4;
2153 bo = 1;
2154 }
2155 else if (bom == 0xFFFE0000) {
2156 q += 4;
2157 bo = -1;
2158 }
2159#endif
2160 }
2161 }
2162
2163 if (bo == -1) {
2164 /* force LE */
2165 iorder[0] = 0;
2166 iorder[1] = 1;
2167 iorder[2] = 2;
2168 iorder[3] = 3;
2169 }
2170 else if (bo == 1) {
2171 /* force BE */
2172 iorder[0] = 3;
2173 iorder[1] = 2;
2174 iorder[2] = 1;
2175 iorder[3] = 0;
2176 }
2177
2178 while (q < e) {
2179 Py_UCS4 ch;
2180 /* remaining bytes at the end? (size should be divisible by 4) */
2181 if (e-q<4) {
2182 if (consumed)
2183 break;
2184 errmsg = "truncated data";
2185 startinpos = ((const char *)q)-starts;
2186 endinpos = ((const char *)e)-starts;
2187 goto utf32Error;
2188 /* The remaining input chars are ignored if the callback
2189 chooses to skip the input */
2190 }
2191 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2192 (q[iorder[1]] << 8) | q[iorder[0]];
2193
2194 if (ch >= 0x110000)
2195 {
2196 errmsg = "codepoint not in range(0x110000)";
2197 startinpos = ((const char *)q)-starts;
2198 endinpos = startinpos+4;
2199 goto utf32Error;
2200 }
2201#ifndef Py_UNICODE_WIDE
2202 if (ch >= 0x10000)
2203 {
2204 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2205 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2206 }
2207 else
2208#endif
2209 *p++ = ch;
2210 q += 4;
2211 continue;
2212 utf32Error:
2213 outpos = p-PyUnicode_AS_UNICODE(unicode);
2214 if (unicode_decode_call_errorhandler(
2215 errors, &errorHandler,
2216 "utf32", errmsg,
2217 starts, size, &startinpos, &endinpos, &exc, &s,
2218 (PyObject **)&unicode, &outpos, &p))
2219 goto onError;
2220 }
2221
2222 if (byteorder)
2223 *byteorder = bo;
2224
2225 if (consumed)
2226 *consumed = (const char *)q-starts;
2227
2228 /* Adjust length */
2229 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2230 goto onError;
2231
2232 Py_XDECREF(errorHandler);
2233 Py_XDECREF(exc);
2234 return (PyObject *)unicode;
2235
2236onError:
2237 Py_DECREF(unicode);
2238 Py_XDECREF(errorHandler);
2239 Py_XDECREF(exc);
2240 return NULL;
2241}
2242
2243PyObject *
2244PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2245 Py_ssize_t size,
2246 const char *errors,
2247 int byteorder)
2248{
2249 PyObject *v;
2250 unsigned char *p;
2251#ifndef Py_UNICODE_WIDE
2252 int i, pairs;
2253#else
2254 const int pairs = 0;
2255#endif
2256 /* Offsets from p for storing byte pairs in the right order. */
2257#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2258 int iorder[] = {0, 1, 2, 3};
2259#else
2260 int iorder[] = {3, 2, 1, 0};
2261#endif
2262
2263#define STORECHAR(CH) \
2264 do { \
2265 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2266 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2267 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2268 p[iorder[0]] = (CH) & 0xff; \
2269 p += 4; \
2270 } while(0)
2271
2272 /* In narrow builds we can output surrogate pairs as one codepoint,
2273 so we need less space. */
2274#ifndef Py_UNICODE_WIDE
2275 for (i = pairs = 0; i < size-1; i++)
2276 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2277 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2278 pairs++;
2279#endif
2280 v = PyString_FromStringAndSize(NULL,
2281 4 * (size - pairs + (byteorder == 0)));
2282 if (v == NULL)
2283 return NULL;
2284
2285 p = (unsigned char *)PyString_AS_STRING(v);
2286 if (byteorder == 0)
2287 STORECHAR(0xFEFF);
2288 if (size == 0)
2289 return v;
2290
2291 if (byteorder == -1) {
2292 /* force LE */
2293 iorder[0] = 0;
2294 iorder[1] = 1;
2295 iorder[2] = 2;
2296 iorder[3] = 3;
2297 }
2298 else if (byteorder == 1) {
2299 /* force BE */
2300 iorder[0] = 3;
2301 iorder[1] = 2;
2302 iorder[2] = 1;
2303 iorder[3] = 0;
2304 }
2305
2306 while (size-- > 0) {
2307 Py_UCS4 ch = *s++;
2308#ifndef Py_UNICODE_WIDE
2309 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2310 Py_UCS4 ch2 = *s;
2311 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2312 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2313 s++;
2314 size--;
2315 }
2316 }
2317#endif
2318 STORECHAR(ch);
2319 }
2320 return v;
2321#undef STORECHAR
2322}
2323
2324PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2325{
2326 if (!PyUnicode_Check(unicode)) {
2327 PyErr_BadArgument();
2328 return NULL;
2329 }
2330 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2331 PyUnicode_GET_SIZE(unicode),
2332 NULL,
2333 0);
2334}
2335
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336/* --- UTF-16 Codec ------------------------------------------------------- */
2337
Tim Peters772747b2001-08-09 22:21:55 +00002338PyObject *
2339PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002340 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002341 const char *errors,
2342 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343{
Walter Dörwald69652032004-09-07 20:24:22 +00002344 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2345}
2346
2347PyObject *
2348PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002349 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002350 const char *errors,
2351 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002355 Py_ssize_t startinpos;
2356 Py_ssize_t endinpos;
2357 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358 PyUnicodeObject *unicode;
2359 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002360 const unsigned char *q, *e;
2361 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002362 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002363 /* Offsets from q for retrieving byte pairs in the right order. */
2364#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2365 int ihi = 1, ilo = 0;
2366#else
2367 int ihi = 0, ilo = 1;
2368#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002369 PyObject *errorHandler = NULL;
2370 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371
2372 /* Note: size will always be longer than the resulting Unicode
2373 character count */
2374 unicode = _PyUnicode_New(size);
2375 if (!unicode)
2376 return NULL;
2377 if (size == 0)
2378 return (PyObject *)unicode;
2379
2380 /* Unpack UTF-16 encoded data */
2381 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002382 q = (unsigned char *)s;
2383 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
2385 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002386 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002388 /* Check for BOM marks (U+FEFF) in the input and adjust current
2389 byte order setting accordingly. In native mode, the leading BOM
2390 mark is skipped, in all other modes, it is copied to the output
2391 stream as-is (giving a ZWNBSP character). */
2392 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002393 if (size >= 2) {
2394 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002395#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002396 if (bom == 0xFEFF) {
2397 q += 2;
2398 bo = -1;
2399 }
2400 else if (bom == 0xFFFE) {
2401 q += 2;
2402 bo = 1;
2403 }
Tim Petersced69f82003-09-16 20:30:58 +00002404#else
Walter Dörwald69652032004-09-07 20:24:22 +00002405 if (bom == 0xFEFF) {
2406 q += 2;
2407 bo = 1;
2408 }
2409 else if (bom == 0xFFFE) {
2410 q += 2;
2411 bo = -1;
2412 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002413#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002414 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
Tim Peters772747b2001-08-09 22:21:55 +00002417 if (bo == -1) {
2418 /* force LE */
2419 ihi = 1;
2420 ilo = 0;
2421 }
2422 else if (bo == 1) {
2423 /* force BE */
2424 ihi = 0;
2425 ilo = 1;
2426 }
2427
2428 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002429 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002430 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002432 if (consumed)
2433 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002434 errmsg = "truncated data";
2435 startinpos = ((const char *)q)-starts;
2436 endinpos = ((const char *)e)-starts;
2437 goto utf16Error;
2438 /* The remaining input chars are ignored if the callback
2439 chooses to skip the input */
2440 }
2441 ch = (q[ihi] << 8) | q[ilo];
2442
Tim Peters772747b2001-08-09 22:21:55 +00002443 q += 2;
2444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 if (ch < 0xD800 || ch > 0xDFFF) {
2446 *p++ = ch;
2447 continue;
2448 }
2449
2450 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 if (q >= e) {
2452 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 startinpos = (((const char *)q)-2)-starts;
2454 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002455 goto utf16Error;
2456 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002457 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002458 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2459 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002460 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002461#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002462 *p++ = ch;
2463 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002464#else
2465 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002467 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002468 }
2469 else {
2470 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002471 startinpos = (((const char *)q)-4)-starts;
2472 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002473 goto utf16Error;
2474 }
2475
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002477 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 startinpos = (((const char *)q)-2)-starts;
2479 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002480 /* Fall through to report the error */
2481
2482 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002483 outpos = p-PyUnicode_AS_UNICODE(unicode);
2484 if (unicode_decode_call_errorhandler(
2485 errors, &errorHandler,
2486 "utf16", errmsg,
2487 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2488 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002489 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491
2492 if (byteorder)
2493 *byteorder = bo;
2494
Walter Dörwald69652032004-09-07 20:24:22 +00002495 if (consumed)
2496 *consumed = (const char *)q-starts;
2497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002499 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 goto onError;
2501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 Py_XDECREF(errorHandler);
2503 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 return (PyObject *)unicode;
2505
2506onError:
2507 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002508 Py_XDECREF(errorHandler);
2509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return NULL;
2511}
2512
Tim Peters772747b2001-08-09 22:21:55 +00002513PyObject *
2514PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002515 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002516 const char *errors,
2517 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518{
2519 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002520 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002521#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002522 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002523#else
2524 const int pairs = 0;
2525#endif
Tim Peters772747b2001-08-09 22:21:55 +00002526 /* Offsets from p for storing byte pairs in the right order. */
2527#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2528 int ihi = 1, ilo = 0;
2529#else
2530 int ihi = 0, ilo = 1;
2531#endif
2532
2533#define STORECHAR(CH) \
2534 do { \
2535 p[ihi] = ((CH) >> 8) & 0xff; \
2536 p[ilo] = (CH) & 0xff; \
2537 p += 2; \
2538 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002540#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002541 for (i = pairs = 0; i < size; i++)
2542 if (s[i] >= 0x10000)
2543 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002544#endif
Tim Petersced69f82003-09-16 20:30:58 +00002545 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002546 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (v == NULL)
2548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
Tim Peters772747b2001-08-09 22:21:55 +00002550 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002552 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002553 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002554 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002555
2556 if (byteorder == -1) {
2557 /* force LE */
2558 ihi = 1;
2559 ilo = 0;
2560 }
2561 else if (byteorder == 1) {
2562 /* force BE */
2563 ihi = 0;
2564 ilo = 1;
2565 }
2566
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002567 while (size-- > 0) {
2568 Py_UNICODE ch = *s++;
2569 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002570#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002571 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002572 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2573 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002575#endif
Tim Peters772747b2001-08-09 22:21:55 +00002576 STORECHAR(ch);
2577 if (ch2)
2578 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002581#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582}
2583
2584PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2585{
2586 if (!PyUnicode_Check(unicode)) {
2587 PyErr_BadArgument();
2588 return NULL;
2589 }
2590 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2591 PyUnicode_GET_SIZE(unicode),
2592 NULL,
2593 0);
2594}
2595
2596/* --- Unicode Escape Codec ----------------------------------------------- */
2597
Fredrik Lundh06d12682001-01-24 07:59:11 +00002598static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002599
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002601 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 const char *errors)
2603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002604 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002605 Py_ssize_t startinpos;
2606 Py_ssize_t endinpos;
2607 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002612 char* message;
2613 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 PyObject *errorHandler = NULL;
2615 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Escaped strings will always be longer than the resulting
2618 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 length after conversion to the true value.
2620 (but if the error callback returns a long replacement string
2621 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 v = _PyUnicode_New(size);
2623 if (v == NULL)
2624 goto onError;
2625 if (size == 0)
2626 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002630
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 while (s < end) {
2632 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002633 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635
2636 /* Non-escape characters are interpreted as Unicode ordinals */
2637 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002638 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 continue;
2640 }
2641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 /* \ - Escapes */
2644 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002645 c = *s++;
2646 if (s > end)
2647 c = '\0'; /* Invalid after \ */
2648 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
2650 /* \x escapes */
2651 case '\n': break;
2652 case '\\': *p++ = '\\'; break;
2653 case '\'': *p++ = '\''; break;
2654 case '\"': *p++ = '\"'; break;
2655 case 'b': *p++ = '\b'; break;
2656 case 'f': *p++ = '\014'; break; /* FF */
2657 case 't': *p++ = '\t'; break;
2658 case 'n': *p++ = '\n'; break;
2659 case 'r': *p++ = '\r'; break;
2660 case 'v': *p++ = '\013'; break; /* VT */
2661 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2662
2663 /* \OOO (octal) escapes */
2664 case '0': case '1': case '2': case '3':
2665 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002666 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002667 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002668 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002669 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002670 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002672 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 break;
2674
Fredrik Lundhccc74732001-02-18 22:13:49 +00002675 /* hex escapes */
2676 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002678 digits = 2;
2679 message = "truncated \\xXX escape";
2680 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681
Fredrik Lundhccc74732001-02-18 22:13:49 +00002682 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002684 digits = 4;
2685 message = "truncated \\uXXXX escape";
2686 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002689 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 digits = 8;
2691 message = "truncated \\UXXXXXXXX escape";
2692 hexescape:
2693 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 outpos = p-PyUnicode_AS_UNICODE(v);
2695 if (s+digits>end) {
2696 endinpos = size;
2697 if (unicode_decode_call_errorhandler(
2698 errors, &errorHandler,
2699 "unicodeescape", "end of string in escape sequence",
2700 starts, size, &startinpos, &endinpos, &exc, &s,
2701 (PyObject **)&v, &outpos, &p))
2702 goto onError;
2703 goto nextByte;
2704 }
2705 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002706 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002707 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 endinpos = (s+i+1)-starts;
2709 if (unicode_decode_call_errorhandler(
2710 errors, &errorHandler,
2711 "unicodeescape", message,
2712 starts, size, &startinpos, &endinpos, &exc, &s,
2713 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002714 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002716 }
2717 chr = (chr<<4) & ~0xF;
2718 if (c >= '0' && c <= '9')
2719 chr += c - '0';
2720 else if (c >= 'a' && c <= 'f')
2721 chr += 10 + c - 'a';
2722 else
2723 chr += 10 + c - 'A';
2724 }
2725 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002726 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 /* _decoding_error will have already written into the
2728 target buffer. */
2729 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002731 /* when we get here, chr is a 32-bit unicode character */
2732 if (chr <= 0xffff)
2733 /* UCS-2 character */
2734 *p++ = (Py_UNICODE) chr;
2735 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002736 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002737 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002738#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002739 *p++ = chr;
2740#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002741 chr -= 0x10000L;
2742 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002743 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002744#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002745 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 endinpos = s-starts;
2747 outpos = p-PyUnicode_AS_UNICODE(v);
2748 if (unicode_decode_call_errorhandler(
2749 errors, &errorHandler,
2750 "unicodeescape", "illegal Unicode character",
2751 starts, size, &startinpos, &endinpos, &exc, &s,
2752 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002753 goto onError;
2754 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002755 break;
2756
2757 /* \N{name} */
2758 case 'N':
2759 message = "malformed \\N character escape";
2760 if (ucnhash_CAPI == NULL) {
2761 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002762 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002763 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764 if (m == NULL)
2765 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002766 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002768 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002770 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002771 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002772 if (ucnhash_CAPI == NULL)
2773 goto ucnhashError;
2774 }
2775 if (*s == '{') {
2776 const char *start = s+1;
2777 /* look for the closing brace */
2778 while (*s != '}' && s < end)
2779 s++;
2780 if (s > start && s < end && *s == '}') {
2781 /* found a name. look it up in the unicode database */
2782 message = "unknown Unicode character name";
2783 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002784 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 goto store;
2786 }
2787 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 endinpos = s-starts;
2789 outpos = p-PyUnicode_AS_UNICODE(v);
2790 if (unicode_decode_call_errorhandler(
2791 errors, &errorHandler,
2792 "unicodeescape", message,
2793 starts, size, &startinpos, &endinpos, &exc, &s,
2794 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796 break;
2797
2798 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002799 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 message = "\\ at end of string";
2801 s--;
2802 endinpos = s-starts;
2803 outpos = p-PyUnicode_AS_UNICODE(v);
2804 if (unicode_decode_call_errorhandler(
2805 errors, &errorHandler,
2806 "unicodeescape", message,
2807 starts, size, &startinpos, &endinpos, &exc, &s,
2808 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002809 goto onError;
2810 }
2811 else {
2812 *p++ = '\\';
2813 *p++ = (unsigned char)s[-1];
2814 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002815 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 nextByte:
2818 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002820 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002825
Fredrik Lundhccc74732001-02-18 22:13:49 +00002826ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002827 PyErr_SetString(
2828 PyExc_UnicodeError,
2829 "\\N escapes not supported (can't load unicodedata module)"
2830 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002831 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 Py_XDECREF(errorHandler);
2833 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002834 return NULL;
2835
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 Py_XDECREF(errorHandler);
2839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 return NULL;
2841}
2842
2843/* Return a Unicode-Escape string version of the Unicode object.
2844
2845 If quotes is true, the string is enclosed in u"" or u'' quotes as
2846 appropriate.
2847
2848*/
2849
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002850Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002851 Py_ssize_t size,
2852 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002853{
2854 /* like wcschr, but doesn't stop at NULL characters */
2855
2856 while (size-- > 0) {
2857 if (*s == ch)
2858 return s;
2859 s++;
2860 }
2861
2862 return NULL;
2863}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865static
2866PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 int quotes)
2869{
2870 PyObject *repr;
2871 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002873 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Neal Norwitz17753ec2006-08-21 22:21:19 +00002875 /* XXX(nnorwitz): rather than over-allocating, it would be
2876 better to choose a different scheme. Perhaps scan the
2877 first N-chars of the string and allocate based on that size.
2878 */
2879 /* Initial allocation is based on the longest-possible unichr
2880 escape.
2881
2882 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2883 unichr, so in this case it's the longest unichr escape. In
2884 narrow (UTF-16) builds this is five chars per source unichr
2885 since there are two unichrs in the surrogate pair, so in narrow
2886 (UTF-16) builds it's not the longest unichr escape.
2887
2888 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2889 so in the narrow (UTF-16) build case it's the longest unichr
2890 escape.
2891 */
2892
2893 repr = PyString_FromStringAndSize(NULL,
2894 2
2895#ifdef Py_UNICODE_WIDE
2896 + 10*size
2897#else
2898 + 6*size
2899#endif
2900 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 if (repr == NULL)
2902 return NULL;
2903
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002904 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905
2906 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002908 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 !findchar(s, size, '"')) ? '"' : '\'';
2910 }
2911 while (size-- > 0) {
2912 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002913
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002914 /* Escape quotes and backslashes */
2915 if ((quotes &&
2916 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 *p++ = '\\';
2918 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002919 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002920 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002921
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002922#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002923 /* Map 21-bit characters to '\U00xxxxxx' */
2924 else if (ch >= 0x10000) {
2925 *p++ = '\\';
2926 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002927 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2928 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2929 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2931 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2932 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2933 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002934 *p++ = hexdigit[ch & 0x0000000F];
2935 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002936 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002937#else
2938 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002939 else if (ch >= 0xD800 && ch < 0xDC00) {
2940 Py_UNICODE ch2;
2941 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002942
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002943 ch2 = *s++;
2944 size--;
2945 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2946 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2947 *p++ = '\\';
2948 *p++ = 'U';
2949 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2950 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2951 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2953 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2954 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2955 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2956 *p++ = hexdigit[ucs & 0x0000000F];
2957 continue;
2958 }
2959 /* Fall through: isolated surrogates are copied as-is */
2960 s--;
2961 size++;
2962 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002963#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002964
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002966 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 *p++ = '\\';
2968 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002969 *p++ = hexdigit[(ch >> 12) & 0x000F];
2970 *p++ = hexdigit[(ch >> 8) & 0x000F];
2971 *p++ = hexdigit[(ch >> 4) & 0x000F];
2972 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002975 /* Map special whitespace to '\t', \n', '\r' */
2976 else if (ch == '\t') {
2977 *p++ = '\\';
2978 *p++ = 't';
2979 }
2980 else if (ch == '\n') {
2981 *p++ = '\\';
2982 *p++ = 'n';
2983 }
2984 else if (ch == '\r') {
2985 *p++ = '\\';
2986 *p++ = 'r';
2987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002989 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002990 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002993 *p++ = hexdigit[(ch >> 4) & 0x000F];
2994 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002996
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 /* Copy everything else as-is */
2998 else
2999 *p++ = (char) ch;
3000 }
3001 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003002 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003
3004 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003005 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 return repr;
3007}
3008
3009PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011{
3012 return unicodeescape_string(s, size, 0);
3013}
3014
3015PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3016{
3017 if (!PyUnicode_Check(unicode)) {
3018 PyErr_BadArgument();
3019 return NULL;
3020 }
3021 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3022 PyUnicode_GET_SIZE(unicode));
3023}
3024
3025/* --- Raw Unicode Escape Codec ------------------------------------------- */
3026
3027PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003028 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 const char *errors)
3030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003032 Py_ssize_t startinpos;
3033 Py_ssize_t endinpos;
3034 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 const char *end;
3038 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 PyObject *errorHandler = NULL;
3040 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003041
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 /* Escaped strings will always be longer than the resulting
3043 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 length after conversion to the true value. (But decoding error
3045 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = _PyUnicode_New(size);
3047 if (v == NULL)
3048 goto onError;
3049 if (size == 0)
3050 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 end = s + size;
3053 while (s < end) {
3054 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003055 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003057 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
3059 /* Non-escape characters are interpreted as Unicode ordinals */
3060 if (*s != '\\') {
3061 *p++ = (unsigned char)*s++;
3062 continue;
3063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 /* \u-escapes are only interpreted iff the number of leading
3067 backslashes if odd */
3068 bs = s;
3069 for (;s < end;) {
3070 if (*s != '\\')
3071 break;
3072 *p++ = (unsigned char)*s++;
3073 }
3074 if (((s - bs) & 1) == 0 ||
3075 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003076 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 continue;
3078 }
3079 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003080 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 s++;
3082
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003083 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003084 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003085 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 endinpos = s-starts;
3089 if (unicode_decode_call_errorhandler(
3090 errors, &errorHandler,
3091 "rawunicodeescape", "truncated \\uXXXX",
3092 starts, size, &startinpos, &endinpos, &exc, &s,
3093 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
3097 x = (x<<4) & ~0xF;
3098 if (c >= '0' && c <= '9')
3099 x += c - '0';
3100 else if (c >= 'a' && c <= 'f')
3101 x += 10 + c - 'a';
3102 else
3103 x += 10 + c - 'A';
3104 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003105 if (x <= 0xffff)
3106 /* UCS-2 character */
3107 *p++ = (Py_UNICODE) x;
3108 else if (x <= 0x10ffff) {
3109 /* UCS-4 character. Either store directly, or as
3110 surrogate pair. */
3111#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003112 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003113#else
3114 x -= 0x10000L;
3115 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3116 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3117#endif
3118 } else {
3119 endinpos = s-starts;
3120 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003121 if (unicode_decode_call_errorhandler(
3122 errors, &errorHandler,
3123 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3124 starts, size, &startinpos, &endinpos, &exc, &s,
3125 (PyObject **)&v, &outpos, &p))
3126 goto onError;
3127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003128 nextByte:
3129 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003131 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 Py_XDECREF(errorHandler);
3134 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003136
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 onError:
3138 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003139 Py_XDECREF(errorHandler);
3140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 return NULL;
3142}
3143
3144PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003145 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146{
3147 PyObject *repr;
3148 char *p;
3149 char *q;
3150
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003151 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003153#ifdef Py_UNICODE_WIDE
3154 repr = PyString_FromStringAndSize(NULL, 10 * size);
3155#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003157#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 if (repr == NULL)
3159 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003160 if (size == 0)
3161 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
3163 p = q = PyString_AS_STRING(repr);
3164 while (size-- > 0) {
3165 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003166#ifdef Py_UNICODE_WIDE
3167 /* Map 32-bit characters to '\Uxxxxxxxx' */
3168 if (ch >= 0x10000) {
3169 *p++ = '\\';
3170 *p++ = 'U';
3171 *p++ = hexdigit[(ch >> 28) & 0xf];
3172 *p++ = hexdigit[(ch >> 24) & 0xf];
3173 *p++ = hexdigit[(ch >> 20) & 0xf];
3174 *p++ = hexdigit[(ch >> 16) & 0xf];
3175 *p++ = hexdigit[(ch >> 12) & 0xf];
3176 *p++ = hexdigit[(ch >> 8) & 0xf];
3177 *p++ = hexdigit[(ch >> 4) & 0xf];
3178 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003179 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003181#else
3182 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3183 if (ch >= 0xD800 && ch < 0xDC00) {
3184 Py_UNICODE ch2;
3185 Py_UCS4 ucs;
3186
3187 ch2 = *s++;
3188 size--;
3189 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3190 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3191 *p++ = '\\';
3192 *p++ = 'U';
3193 *p++ = hexdigit[(ucs >> 28) & 0xf];
3194 *p++ = hexdigit[(ucs >> 24) & 0xf];
3195 *p++ = hexdigit[(ucs >> 20) & 0xf];
3196 *p++ = hexdigit[(ucs >> 16) & 0xf];
3197 *p++ = hexdigit[(ucs >> 12) & 0xf];
3198 *p++ = hexdigit[(ucs >> 8) & 0xf];
3199 *p++ = hexdigit[(ucs >> 4) & 0xf];
3200 *p++ = hexdigit[ucs & 0xf];
3201 continue;
3202 }
3203 /* Fall through: isolated surrogates are copied as-is */
3204 s--;
3205 size++;
3206 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 /* Map 16-bit characters to '\uxxxx' */
3209 if (ch >= 256) {
3210 *p++ = '\\';
3211 *p++ = 'u';
3212 *p++ = hexdigit[(ch >> 12) & 0xf];
3213 *p++ = hexdigit[(ch >> 8) & 0xf];
3214 *p++ = hexdigit[(ch >> 4) & 0xf];
3215 *p++ = hexdigit[ch & 15];
3216 }
3217 /* Copy everything else as-is */
3218 else
3219 *p++ = (char) ch;
3220 }
3221 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003222 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 return repr;
3224}
3225
3226PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3227{
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_BadArgument();
3230 return NULL;
3231 }
3232 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3233 PyUnicode_GET_SIZE(unicode));
3234}
3235
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003236/* --- Unicode Internal Codec ------------------------------------------- */
3237
3238PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003239 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003240 const char *errors)
3241{
3242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003243 Py_ssize_t startinpos;
3244 Py_ssize_t endinpos;
3245 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003246 PyUnicodeObject *v;
3247 Py_UNICODE *p;
3248 const char *end;
3249 const char *reason;
3250 PyObject *errorHandler = NULL;
3251 PyObject *exc = NULL;
3252
Neal Norwitzd43069c2006-01-08 01:12:10 +00003253#ifdef Py_UNICODE_WIDE
3254 Py_UNICODE unimax = PyUnicode_GetMax();
3255#endif
3256
Armin Rigo7ccbca92006-10-04 12:17:45 +00003257 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003258 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3259 if (v == NULL)
3260 goto onError;
3261 if (PyUnicode_GetSize((PyObject *)v) == 0)
3262 return (PyObject *)v;
3263 p = PyUnicode_AS_UNICODE(v);
3264 end = s + size;
3265
3266 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003267 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003268 /* We have to sanity check the raw data, otherwise doom looms for
3269 some malformed UCS-4 data. */
3270 if (
3271 #ifdef Py_UNICODE_WIDE
3272 *p > unimax || *p < 0 ||
3273 #endif
3274 end-s < Py_UNICODE_SIZE
3275 )
3276 {
3277 startinpos = s - starts;
3278 if (end-s < Py_UNICODE_SIZE) {
3279 endinpos = end-starts;
3280 reason = "truncated input";
3281 }
3282 else {
3283 endinpos = s - starts + Py_UNICODE_SIZE;
3284 reason = "illegal code point (> 0x10FFFF)";
3285 }
3286 outpos = p - PyUnicode_AS_UNICODE(v);
3287 if (unicode_decode_call_errorhandler(
3288 errors, &errorHandler,
3289 "unicode_internal", reason,
3290 starts, size, &startinpos, &endinpos, &exc, &s,
3291 (PyObject **)&v, &outpos, &p)) {
3292 goto onError;
3293 }
3294 }
3295 else {
3296 p++;
3297 s += Py_UNICODE_SIZE;
3298 }
3299 }
3300
Martin v. Löwis412fb672006-04-13 06:34:32 +00003301 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003302 goto onError;
3303 Py_XDECREF(errorHandler);
3304 Py_XDECREF(exc);
3305 return (PyObject *)v;
3306
3307 onError:
3308 Py_XDECREF(v);
3309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
3311 return NULL;
3312}
3313
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314/* --- Latin-1 Codec ------------------------------------------------------ */
3315
3316PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 const char *errors)
3319{
3320 PyUnicodeObject *v;
3321 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003322
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003324 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003325 Py_UNICODE r = *(unsigned char*)s;
3326 return PyUnicode_FromUnicode(&r, 1);
3327 }
3328
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 v = _PyUnicode_New(size);
3330 if (v == NULL)
3331 goto onError;
3332 if (size == 0)
3333 return (PyObject *)v;
3334 p = PyUnicode_AS_UNICODE(v);
3335 while (size-- > 0)
3336 *p++ = (unsigned char)*s++;
3337 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 onError:
3340 Py_XDECREF(v);
3341 return NULL;
3342}
3343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344/* create or adjust a UnicodeEncodeError */
3345static void make_encode_exception(PyObject **exceptionObject,
3346 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003347 const Py_UNICODE *unicode, Py_ssize_t size,
3348 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 if (*exceptionObject == NULL) {
3352 *exceptionObject = PyUnicodeEncodeError_Create(
3353 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354 }
3355 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3357 goto onError;
3358 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3359 goto onError;
3360 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3361 goto onError;
3362 return;
3363 onError:
3364 Py_DECREF(*exceptionObject);
3365 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 }
3367}
3368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369/* raises a UnicodeEncodeError */
3370static void raise_encode_exception(PyObject **exceptionObject,
3371 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 const Py_UNICODE *unicode, Py_ssize_t size,
3373 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 const char *reason)
3375{
3376 make_encode_exception(exceptionObject,
3377 encoding, unicode, size, startpos, endpos, reason);
3378 if (*exceptionObject != NULL)
3379 PyCodec_StrictErrors(*exceptionObject);
3380}
3381
3382/* error handling callback helper:
3383 build arguments, call the callback and check the arguments,
3384 put the result into newpos and return the replacement string, which
3385 has to be freed by the caller */
3386static PyObject *unicode_encode_call_errorhandler(const char *errors,
3387 PyObject **errorHandler,
3388 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003389 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3390 Py_ssize_t startpos, Py_ssize_t endpos,
3391 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003393 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394
3395 PyObject *restuple;
3396 PyObject *resunicode;
3397
3398 if (*errorHandler == NULL) {
3399 *errorHandler = PyCodec_LookupError(errors);
3400 if (*errorHandler == NULL)
3401 return NULL;
3402 }
3403
3404 make_encode_exception(exceptionObject,
3405 encoding, unicode, size, startpos, endpos, reason);
3406 if (*exceptionObject == NULL)
3407 return NULL;
3408
3409 restuple = PyObject_CallFunctionObjArgs(
3410 *errorHandler, *exceptionObject, NULL);
3411 if (restuple == NULL)
3412 return NULL;
3413 if (!PyTuple_Check(restuple)) {
3414 PyErr_Format(PyExc_TypeError, &argparse[4]);
3415 Py_DECREF(restuple);
3416 return NULL;
3417 }
3418 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3419 &resunicode, newpos)) {
3420 Py_DECREF(restuple);
3421 return NULL;
3422 }
3423 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003424 *newpos = size+*newpos;
3425 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003426 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003427 Py_DECREF(restuple);
3428 return NULL;
3429 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 Py_INCREF(resunicode);
3431 Py_DECREF(restuple);
3432 return resunicode;
3433}
3434
3435static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 const char *errors,
3438 int limit)
3439{
3440 /* output object */
3441 PyObject *res;
3442 /* pointers to the beginning and end+1 of input */
3443 const Py_UNICODE *startp = p;
3444 const Py_UNICODE *endp = p + size;
3445 /* pointer to the beginning of the unencodable characters */
3446 /* const Py_UNICODE *badp = NULL; */
3447 /* pointer into the output */
3448 char *str;
3449 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003450 Py_ssize_t respos = 0;
3451 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003452 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3453 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 PyObject *errorHandler = NULL;
3455 PyObject *exc = NULL;
3456 /* the following variable is used for caching string comparisons
3457 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3458 int known_errorHandler = -1;
3459
3460 /* allocate enough for a simple encoding without
3461 replacements, if we need more, we'll resize */
3462 res = PyString_FromStringAndSize(NULL, size);
3463 if (res == NULL)
3464 goto onError;
3465 if (size == 0)
3466 return res;
3467 str = PyString_AS_STRING(res);
3468 ressize = size;
3469
3470 while (p<endp) {
3471 Py_UNICODE c = *p;
3472
3473 /* can we encode this? */
3474 if (c<limit) {
3475 /* no overflow check, because we know that the space is enough */
3476 *str++ = (char)c;
3477 ++p;
3478 }
3479 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003480 Py_ssize_t unicodepos = p-startp;
3481 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t repsize;
3484 Py_ssize_t newpos;
3485 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 Py_UNICODE *uni2;
3487 /* startpos for collecting unencodable chars */
3488 const Py_UNICODE *collstart = p;
3489 const Py_UNICODE *collend = p;
3490 /* find all unecodable characters */
3491 while ((collend < endp) && ((*collend)>=limit))
3492 ++collend;
3493 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3494 if (known_errorHandler==-1) {
3495 if ((errors==NULL) || (!strcmp(errors, "strict")))
3496 known_errorHandler = 1;
3497 else if (!strcmp(errors, "replace"))
3498 known_errorHandler = 2;
3499 else if (!strcmp(errors, "ignore"))
3500 known_errorHandler = 3;
3501 else if (!strcmp(errors, "xmlcharrefreplace"))
3502 known_errorHandler = 4;
3503 else
3504 known_errorHandler = 0;
3505 }
3506 switch (known_errorHandler) {
3507 case 1: /* strict */
3508 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3509 goto onError;
3510 case 2: /* replace */
3511 while (collstart++<collend)
3512 *str++ = '?'; /* fall through */
3513 case 3: /* ignore */
3514 p = collend;
3515 break;
3516 case 4: /* xmlcharrefreplace */
3517 respos = str-PyString_AS_STRING(res);
3518 /* determine replacement size (temporarily (mis)uses p) */
3519 for (p = collstart, repsize = 0; p < collend; ++p) {
3520 if (*p<10)
3521 repsize += 2+1+1;
3522 else if (*p<100)
3523 repsize += 2+2+1;
3524 else if (*p<1000)
3525 repsize += 2+3+1;
3526 else if (*p<10000)
3527 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003528#ifndef Py_UNICODE_WIDE
3529 else
3530 repsize += 2+5+1;
3531#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 else if (*p<100000)
3533 repsize += 2+5+1;
3534 else if (*p<1000000)
3535 repsize += 2+6+1;
3536 else
3537 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003538#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 }
3540 requiredsize = respos+repsize+(endp-collend);
3541 if (requiredsize > ressize) {
3542 if (requiredsize<2*ressize)
3543 requiredsize = 2*ressize;
3544 if (_PyString_Resize(&res, requiredsize))
3545 goto onError;
3546 str = PyString_AS_STRING(res) + respos;
3547 ressize = requiredsize;
3548 }
3549 /* generate replacement (temporarily (mis)uses p) */
3550 for (p = collstart; p < collend; ++p) {
3551 str += sprintf(str, "&#%d;", (int)*p);
3552 }
3553 p = collend;
3554 break;
3555 default:
3556 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3557 encoding, reason, startp, size, &exc,
3558 collstart-startp, collend-startp, &newpos);
3559 if (repunicode == NULL)
3560 goto onError;
3561 /* need more space? (at least enough for what we
3562 have+the replacement+the rest of the string, so
3563 we won't have to check space for encodable characters) */
3564 respos = str-PyString_AS_STRING(res);
3565 repsize = PyUnicode_GET_SIZE(repunicode);
3566 requiredsize = respos+repsize+(endp-collend);
3567 if (requiredsize > ressize) {
3568 if (requiredsize<2*ressize)
3569 requiredsize = 2*ressize;
3570 if (_PyString_Resize(&res, requiredsize)) {
3571 Py_DECREF(repunicode);
3572 goto onError;
3573 }
3574 str = PyString_AS_STRING(res) + respos;
3575 ressize = requiredsize;
3576 }
3577 /* check if there is anything unencodable in the replacement
3578 and copy it to the output */
3579 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3580 c = *uni2;
3581 if (c >= limit) {
3582 raise_encode_exception(&exc, encoding, startp, size,
3583 unicodepos, unicodepos+1, reason);
3584 Py_DECREF(repunicode);
3585 goto onError;
3586 }
3587 *str = (char)c;
3588 }
3589 p = startp + newpos;
3590 Py_DECREF(repunicode);
3591 }
3592 }
3593 }
3594 /* Resize if we allocated to much */
3595 respos = str-PyString_AS_STRING(res);
3596 if (respos<ressize)
3597 /* If this falls res will be NULL */
3598 _PyString_Resize(&res, respos);
3599 Py_XDECREF(errorHandler);
3600 Py_XDECREF(exc);
3601 return res;
3602
3603 onError:
3604 Py_XDECREF(res);
3605 Py_XDECREF(errorHandler);
3606 Py_XDECREF(exc);
3607 return NULL;
3608}
3609
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003611 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 const char *errors)
3613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615}
3616
3617PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3618{
3619 if (!PyUnicode_Check(unicode)) {
3620 PyErr_BadArgument();
3621 return NULL;
3622 }
3623 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3624 PyUnicode_GET_SIZE(unicode),
3625 NULL);
3626}
3627
3628/* --- 7-bit ASCII Codec -------------------------------------------------- */
3629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003631 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 const char *errors)
3633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 PyUnicodeObject *v;
3636 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003637 Py_ssize_t startinpos;
3638 Py_ssize_t endinpos;
3639 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 const char *e;
3641 PyObject *errorHandler = NULL;
3642 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003645 if (size == 1 && *(unsigned char*)s < 128) {
3646 Py_UNICODE r = *(unsigned char*)s;
3647 return PyUnicode_FromUnicode(&r, 1);
3648 }
Tim Petersced69f82003-09-16 20:30:58 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 v = _PyUnicode_New(size);
3651 if (v == NULL)
3652 goto onError;
3653 if (size == 0)
3654 return (PyObject *)v;
3655 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 e = s + size;
3657 while (s < e) {
3658 register unsigned char c = (unsigned char)*s;
3659 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 ++s;
3662 }
3663 else {
3664 startinpos = s-starts;
3665 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003666 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 if (unicode_decode_call_errorhandler(
3668 errors, &errorHandler,
3669 "ascii", "ordinal not in range(128)",
3670 starts, size, &startinpos, &endinpos, &exc, &s,
3671 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003675 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003676 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_XDECREF(errorHandler);
3679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003681
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 onError:
3683 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return NULL;
3687}
3688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003690 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 const char *errors)
3692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694}
3695
3696PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3697{
3698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
3702 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3703 PyUnicode_GET_SIZE(unicode),
3704 NULL);
3705}
3706
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003708
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003709/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003710
Martin v. Löwisd8251432006-06-14 05:21:04 +00003711#if SIZEOF_INT < SIZEOF_SSIZE_T
3712#define NEED_RETRY
3713#endif
3714
3715/* XXX This code is limited to "true" double-byte encodings, as
3716 a) it assumes an incomplete character consists of a single byte, and
3717 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3718 encodings, see IsDBCSLeadByteEx documentation. */
3719
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the byte CharPrev() lands on
   is not itself a lead byte, or when that previous position is exactly two
   bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3730
3731/*
3732 * Decode MBCS string into unicode object. If 'final' is set, converts
3733 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3734 */
3735static int decode_mbcs(PyUnicodeObject **v,
3736 const char *s, /* MBCS string */
3737 int size, /* sizeof MBCS string */
3738 int final)
3739{
3740 Py_UNICODE *p;
3741 Py_ssize_t n = 0;
3742 int usize = 0;
3743
3744 assert(size >= 0);
3745
3746 /* Skip trailing lead-byte unless 'final' is set */
3747 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3748 --size;
3749
3750 /* First get the size of the result */
3751 if (size > 0) {
3752 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3753 if (usize == 0) {
3754 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3755 return -1;
3756 }
3757 }
3758
3759 if (*v == NULL) {
3760 /* Create unicode object */
3761 *v = _PyUnicode_New(usize);
3762 if (*v == NULL)
3763 return -1;
3764 }
3765 else {
3766 /* Extend unicode object */
3767 n = PyUnicode_GET_SIZE(*v);
3768 if (_PyUnicode_Resize(v, n + usize) < 0)
3769 return -1;
3770 }
3771
3772 /* Do the conversion */
3773 if (size > 0) {
3774 p = PyUnicode_AS_UNICODE(*v) + n;
3775 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3776 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3777 return -1;
3778 }
3779 }
3780
3781 return size;
3782}
3783
3784PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3785 Py_ssize_t size,
3786 const char *errors,
3787 Py_ssize_t *consumed)
3788{
3789 PyUnicodeObject *v = NULL;
3790 int done;
3791
3792 if (consumed)
3793 *consumed = 0;
3794
3795#ifdef NEED_RETRY
3796 retry:
3797 if (size > INT_MAX)
3798 done = decode_mbcs(&v, s, INT_MAX, 0);
3799 else
3800#endif
3801 done = decode_mbcs(&v, s, (int)size, !consumed);
3802
3803 if (done < 0) {
3804 Py_XDECREF(v);
3805 return NULL;
3806 }
3807
3808 if (consumed)
3809 *consumed += done;
3810
3811#ifdef NEED_RETRY
3812 if (size > INT_MAX) {
3813 s += done;
3814 size -= done;
3815 goto retry;
3816 }
3817#endif
3818
3819 return (PyObject *)v;
3820}
3821
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003822PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003824 const char *errors)
3825{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003826 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3827}
3828
3829/*
3830 * Convert unicode into string object (MBCS).
3831 * Returns 0 if succeed, -1 otherwise.
3832 */
3833static int encode_mbcs(PyObject **repr,
3834 const Py_UNICODE *p, /* unicode */
3835 int size) /* size of unicode */
3836{
3837 int mbcssize = 0;
3838 Py_ssize_t n = 0;
3839
3840 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003841
3842 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003843 if (size > 0) {
3844 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3845 if (mbcssize == 0) {
3846 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3847 return -1;
3848 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003849 }
3850
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851 if (*repr == NULL) {
3852 /* Create string object */
3853 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3854 if (*repr == NULL)
3855 return -1;
3856 }
3857 else {
3858 /* Extend string object */
3859 n = PyString_Size(*repr);
3860 if (_PyString_Resize(repr, n + mbcssize) < 0)
3861 return -1;
3862 }
3863
3864 /* Do the conversion */
3865 if (size > 0) {
3866 char *s = PyString_AS_STRING(*repr) + n;
3867 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3868 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3869 return -1;
3870 }
3871 }
3872
3873 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003874}
3875
3876PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003877 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003878 const char *errors)
3879{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880 PyObject *repr = NULL;
3881 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003882
Martin v. Löwisd8251432006-06-14 05:21:04 +00003883#ifdef NEED_RETRY
3884 retry:
3885 if (size > INT_MAX)
3886 ret = encode_mbcs(&repr, p, INT_MAX);
3887 else
3888#endif
3889 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003890
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 if (ret < 0) {
3892 Py_XDECREF(repr);
3893 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003894 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895
3896#ifdef NEED_RETRY
3897 if (size > INT_MAX) {
3898 p += INT_MAX;
3899 size -= INT_MAX;
3900 goto retry;
3901 }
3902#endif
3903
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003904 return repr;
3905}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003906
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003907PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3908{
3909 if (!PyUnicode_Check(unicode)) {
3910 PyErr_BadArgument();
3911 return NULL;
3912 }
3913 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3914 PyUnicode_GET_SIZE(unicode),
3915 NULL);
3916}
3917
Martin v. Löwisd8251432006-06-14 05:21:04 +00003918#undef NEED_RETRY
3919
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003920#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003921
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922/* --- Character Mapping Codec -------------------------------------------- */
3923
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003925 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 PyObject *mapping,
3927 const char *errors)
3928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t startinpos;
3931 Py_ssize_t endinpos;
3932 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 PyUnicodeObject *v;
3935 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 PyObject *errorHandler = NULL;
3938 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003940 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003941
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 /* Default to Latin-1 */
3943 if (mapping == NULL)
3944 return PyUnicode_DecodeLatin1(s, size, errors);
3945
3946 v = _PyUnicode_New(size);
3947 if (v == NULL)
3948 goto onError;
3949 if (size == 0)
3950 return (PyObject *)v;
3951 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003953 if (PyUnicode_CheckExact(mapping)) {
3954 mapstring = PyUnicode_AS_UNICODE(mapping);
3955 maplen = PyUnicode_GET_SIZE(mapping);
3956 while (s < e) {
3957 unsigned char ch = *s;
3958 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003960 if (ch < maplen)
3961 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003963 if (x == 0xfffe) {
3964 /* undefined mapping */
3965 outpos = p-PyUnicode_AS_UNICODE(v);
3966 startinpos = s-starts;
3967 endinpos = startinpos+1;
3968 if (unicode_decode_call_errorhandler(
3969 errors, &errorHandler,
3970 "charmap", "character maps to <undefined>",
3971 starts, size, &startinpos, &endinpos, &exc, &s,
3972 (PyObject **)&v, &outpos, &p)) {
3973 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003974 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003975 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003976 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003977 *p++ = x;
3978 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003980 }
3981 else {
3982 while (s < e) {
3983 unsigned char ch = *s;
3984 PyObject *w, *x;
3985
3986 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3987 w = PyInt_FromLong((long)ch);
3988 if (w == NULL)
3989 goto onError;
3990 x = PyObject_GetItem(mapping, w);
3991 Py_DECREF(w);
3992 if (x == NULL) {
3993 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3994 /* No mapping found means: mapping is undefined. */
3995 PyErr_Clear();
3996 x = Py_None;
3997 Py_INCREF(x);
3998 } else
3999 goto onError;
4000 }
4001
4002 /* Apply mapping */
4003 if (PyInt_Check(x)) {
4004 long value = PyInt_AS_LONG(x);
4005 if (value < 0 || value > 65535) {
4006 PyErr_SetString(PyExc_TypeError,
4007 "character mapping must be in range(65536)");
4008 Py_DECREF(x);
4009 goto onError;
4010 }
4011 *p++ = (Py_UNICODE)value;
4012 }
4013 else if (x == Py_None) {
4014 /* undefined mapping */
4015 outpos = p-PyUnicode_AS_UNICODE(v);
4016 startinpos = s-starts;
4017 endinpos = startinpos+1;
4018 if (unicode_decode_call_errorhandler(
4019 errors, &errorHandler,
4020 "charmap", "character maps to <undefined>",
4021 starts, size, &startinpos, &endinpos, &exc, &s,
4022 (PyObject **)&v, &outpos, &p)) {
4023 Py_DECREF(x);
4024 goto onError;
4025 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004026 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004027 continue;
4028 }
4029 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004030 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004031
4032 if (targetsize == 1)
4033 /* 1-1 mapping */
4034 *p++ = *PyUnicode_AS_UNICODE(x);
4035
4036 else if (targetsize > 1) {
4037 /* 1-n mapping */
4038 if (targetsize > extrachars) {
4039 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4041 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004042 (targetsize << 2);
4043 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004044 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004045 if (_PyUnicode_Resize(&v,
4046 PyUnicode_GET_SIZE(v) + needed) < 0) {
4047 Py_DECREF(x);
4048 goto onError;
4049 }
4050 p = PyUnicode_AS_UNICODE(v) + oldpos;
4051 }
4052 Py_UNICODE_COPY(p,
4053 PyUnicode_AS_UNICODE(x),
4054 targetsize);
4055 p += targetsize;
4056 extrachars -= targetsize;
4057 }
4058 /* 1-0 mapping: skip the character */
4059 }
4060 else {
4061 /* wrong return value */
4062 PyErr_SetString(PyExc_TypeError,
4063 "character mapping must return integer, None or unicode");
4064 Py_DECREF(x);
4065 goto onError;
4066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004068 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 }
4071 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004072 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 Py_XDECREF(errorHandler);
4075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 Py_XDECREF(errorHandler);
4080 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 Py_XDECREF(v);
4082 return NULL;
4083}
4084
Martin v. Löwis3f767792006-06-04 19:36:28 +00004085/* Charmap encoding: the lookup table */
4086
4087struct encoding_map{
4088 PyObject_HEAD
4089 unsigned char level1[32];
4090 int count2, count3;
4091 unsigned char level23[1];
4092};
4093
4094static PyObject*
4095encoding_map_size(PyObject *obj, PyObject* args)
4096{
4097 struct encoding_map *map = (struct encoding_map*)obj;
4098 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4099 128*map->count3);
4100}
4101
4102static PyMethodDef encoding_map_methods[] = {
4103 {"size", encoding_map_size, METH_NOARGS,
4104 PyDoc_STR("Return the size (in bytes) of this object") },
4105 { 0 }
4106};
4107
4108static void
4109encoding_map_dealloc(PyObject* o)
4110{
4111 PyObject_FREE(o);
4112}
4113
4114static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004115 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004116 "EncodingMap", /*tp_name*/
4117 sizeof(struct encoding_map), /*tp_basicsize*/
4118 0, /*tp_itemsize*/
4119 /* methods */
4120 encoding_map_dealloc, /*tp_dealloc*/
4121 0, /*tp_print*/
4122 0, /*tp_getattr*/
4123 0, /*tp_setattr*/
4124 0, /*tp_compare*/
4125 0, /*tp_repr*/
4126 0, /*tp_as_number*/
4127 0, /*tp_as_sequence*/
4128 0, /*tp_as_mapping*/
4129 0, /*tp_hash*/
4130 0, /*tp_call*/
4131 0, /*tp_str*/
4132 0, /*tp_getattro*/
4133 0, /*tp_setattro*/
4134 0, /*tp_as_buffer*/
4135 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4136 0, /*tp_doc*/
4137 0, /*tp_traverse*/
4138 0, /*tp_clear*/
4139 0, /*tp_richcompare*/
4140 0, /*tp_weaklistoffset*/
4141 0, /*tp_iter*/
4142 0, /*tp_iternext*/
4143 encoding_map_methods, /*tp_methods*/
4144 0, /*tp_members*/
4145 0, /*tp_getset*/
4146 0, /*tp_base*/
4147 0, /*tp_dict*/
4148 0, /*tp_descr_get*/
4149 0, /*tp_descr_set*/
4150 0, /*tp_dictoffset*/
4151 0, /*tp_init*/
4152 0, /*tp_alloc*/
4153 0, /*tp_new*/
4154 0, /*tp_free*/
4155 0, /*tp_is_gc*/
4156};
4157
4158PyObject*
4159PyUnicode_BuildEncodingMap(PyObject* string)
4160{
4161 Py_UNICODE *decode;
4162 PyObject *result;
4163 struct encoding_map *mresult;
4164 int i;
4165 int need_dict = 0;
4166 unsigned char level1[32];
4167 unsigned char level2[512];
4168 unsigned char *mlevel1, *mlevel2, *mlevel3;
4169 int count2 = 0, count3 = 0;
4170
4171 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4172 PyErr_BadArgument();
4173 return NULL;
4174 }
4175 decode = PyUnicode_AS_UNICODE(string);
4176 memset(level1, 0xFF, sizeof level1);
4177 memset(level2, 0xFF, sizeof level2);
4178
4179 /* If there isn't a one-to-one mapping of NULL to \0,
4180 or if there are non-BMP characters, we need to use
4181 a mapping dictionary. */
4182 if (decode[0] != 0)
4183 need_dict = 1;
4184 for (i = 1; i < 256; i++) {
4185 int l1, l2;
4186 if (decode[i] == 0
4187 #ifdef Py_UNICODE_WIDE
4188 || decode[i] > 0xFFFF
4189 #endif
4190 ) {
4191 need_dict = 1;
4192 break;
4193 }
4194 if (decode[i] == 0xFFFE)
4195 /* unmapped character */
4196 continue;
4197 l1 = decode[i] >> 11;
4198 l2 = decode[i] >> 7;
4199 if (level1[l1] == 0xFF)
4200 level1[l1] = count2++;
4201 if (level2[l2] == 0xFF)
4202 level2[l2] = count3++;
4203 }
4204
4205 if (count2 >= 0xFF || count3 >= 0xFF)
4206 need_dict = 1;
4207
4208 if (need_dict) {
4209 PyObject *result = PyDict_New();
4210 PyObject *key, *value;
4211 if (!result)
4212 return NULL;
4213 for (i = 0; i < 256; i++) {
4214 key = value = NULL;
4215 key = PyInt_FromLong(decode[i]);
4216 value = PyInt_FromLong(i);
4217 if (!key || !value)
4218 goto failed1;
4219 if (PyDict_SetItem(result, key, value) == -1)
4220 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004221 Py_DECREF(key);
4222 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004223 }
4224 return result;
4225 failed1:
4226 Py_XDECREF(key);
4227 Py_XDECREF(value);
4228 Py_DECREF(result);
4229 return NULL;
4230 }
4231
4232 /* Create a three-level trie */
4233 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4234 16*count2 + 128*count3 - 1);
4235 if (!result)
4236 return PyErr_NoMemory();
4237 PyObject_Init(result, &EncodingMapType);
4238 mresult = (struct encoding_map*)result;
4239 mresult->count2 = count2;
4240 mresult->count3 = count3;
4241 mlevel1 = mresult->level1;
4242 mlevel2 = mresult->level23;
4243 mlevel3 = mresult->level23 + 16*count2;
4244 memcpy(mlevel1, level1, 32);
4245 memset(mlevel2, 0xFF, 16*count2);
4246 memset(mlevel3, 0, 128*count3);
4247 count3 = 0;
4248 for (i = 1; i < 256; i++) {
4249 int o1, o2, o3, i2, i3;
4250 if (decode[i] == 0xFFFE)
4251 /* unmapped character */
4252 continue;
4253 o1 = decode[i]>>11;
4254 o2 = (decode[i]>>7) & 0xF;
4255 i2 = 16*mlevel1[o1] + o2;
4256 if (mlevel2[i2] == 0xFF)
4257 mlevel2[i2] = count3++;
4258 o3 = decode[i] & 0x7F;
4259 i3 = 128*mlevel2[i2] + o3;
4260 mlevel3[i3] = i;
4261 }
4262 return result;
4263}
4264
4265static int
4266encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4267{
4268 struct encoding_map *map = (struct encoding_map*)mapping;
4269 int l1 = c>>11;
4270 int l2 = (c>>7) & 0xF;
4271 int l3 = c & 0x7F;
4272 int i;
4273
4274#ifdef Py_UNICODE_WIDE
4275 if (c > 0xFFFF) {
4276 return -1;
4277 }
4278#endif
4279 if (c == 0)
4280 return 0;
4281 /* level 1*/
4282 i = map->level1[l1];
4283 if (i == 0xFF) {
4284 return -1;
4285 }
4286 /* level 2*/
4287 i = map->level23[16*i+l2];
4288 if (i == 0xFF) {
4289 return -1;
4290 }
4291 /* level 3 */
4292 i = map->level23[16*map->count2 + 128*i + l3];
4293 if (i == 0) {
4294 return -1;
4295 }
4296 return i;
4297}
4298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299/* Lookup the character ch in the mapping. If the character
4300 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004301 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 PyObject *w = PyInt_FromLong((long)c);
4305 PyObject *x;
4306
4307 if (w == NULL)
4308 return NULL;
4309 x = PyObject_GetItem(mapping, w);
4310 Py_DECREF(w);
4311 if (x == NULL) {
4312 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4313 /* No mapping found means: mapping is undefined. */
4314 PyErr_Clear();
4315 x = Py_None;
4316 Py_INCREF(x);
4317 return x;
4318 } else
4319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004321 else if (x == Py_None)
4322 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 else if (PyInt_Check(x)) {
4324 long value = PyInt_AS_LONG(x);
4325 if (value < 0 || value > 255) {
4326 PyErr_SetString(PyExc_TypeError,
4327 "character mapping must be in range(256)");
4328 Py_DECREF(x);
4329 return NULL;
4330 }
4331 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 else if (PyString_Check(x))
4334 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 /* wrong return value */
4337 PyErr_SetString(PyExc_TypeError,
4338 "character mapping must return integer, None or str");
4339 Py_DECREF(x);
4340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 }
4342}
4343
Martin v. Löwis3f767792006-06-04 19:36:28 +00004344static int
4345charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4346{
4347 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4348 /* exponentially overallocate to minimize reallocations */
4349 if (requiredsize < 2*outsize)
4350 requiredsize = 2*outsize;
4351 if (_PyString_Resize(outobj, requiredsize)) {
4352 return 0;
4353 }
4354 return 1;
4355}
4356
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360/* lookup the character, put the result in the output string and adjust
4361 various state variables. Reallocate the output string if not enough
4362 space is available. Return a new reference to the object that
4363 was put in the output buffer, or Py_None, if the mapping was undefined
4364 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004365 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004367charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004368 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004370 PyObject *rep;
4371 char *outstart;
4372 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373
Christian Heimese93237d2007-12-19 02:37:44 +00004374 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375 int res = encoding_map_lookup(c, mapping);
4376 Py_ssize_t requiredsize = *outpos+1;
4377 if (res == -1)
4378 return enc_FAILED;
4379 if (outsize<requiredsize)
4380 if (!charmapencode_resize(outobj, outpos, requiredsize))
4381 return enc_EXCEPTION;
4382 outstart = PyString_AS_STRING(*outobj);
4383 outstart[(*outpos)++] = (char)res;
4384 return enc_SUCCESS;
4385 }
4386
4387 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004389 return enc_EXCEPTION;
4390 else if (rep==Py_None) {
4391 Py_DECREF(rep);
4392 return enc_FAILED;
4393 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004395 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396 if (outsize<requiredsize)
4397 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004401 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4403 }
4404 else {
4405 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004406 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4407 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 if (outsize<requiredsize)
4409 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004411 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004413 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 memcpy(outstart + *outpos, repchars, repsize);
4415 *outpos += repsize;
4416 }
4417 }
Georg Brandl9f167602006-06-04 21:46:16 +00004418 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004419 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420}
4421
4422/* handle an error in PyUnicode_EncodeCharmap
4423 Return 0 on success, -1 on error */
4424static
4425int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004428 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004429 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430{
4431 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t repsize;
4433 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 Py_UNICODE *uni2;
4435 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t collstartpos = *inpos;
4437 Py_ssize_t collendpos = *inpos+1;
4438 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 char *encoding = "charmap";
4440 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004441 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* find all unencodable characters */
4444 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004445 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004446 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004447 int res = encoding_map_lookup(p[collendpos], mapping);
4448 if (res != -1)
4449 break;
4450 ++collendpos;
4451 continue;
4452 }
4453
4454 rep = charmapencode_lookup(p[collendpos], mapping);
4455 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004457 else if (rep!=Py_None) {
4458 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 break;
4460 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004461 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 ++collendpos;
4463 }
4464 /* cache callback name lookup
4465 * (if not done yet, i.e. it's the first error) */
4466 if (*known_errorHandler==-1) {
4467 if ((errors==NULL) || (!strcmp(errors, "strict")))
4468 *known_errorHandler = 1;
4469 else if (!strcmp(errors, "replace"))
4470 *known_errorHandler = 2;
4471 else if (!strcmp(errors, "ignore"))
4472 *known_errorHandler = 3;
4473 else if (!strcmp(errors, "xmlcharrefreplace"))
4474 *known_errorHandler = 4;
4475 else
4476 *known_errorHandler = 0;
4477 }
4478 switch (*known_errorHandler) {
4479 case 1: /* strict */
4480 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4481 return -1;
4482 case 2: /* replace */
4483 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4484 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004485 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 return -1;
4487 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004488 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4490 return -1;
4491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 }
4493 /* fall through */
4494 case 3: /* ignore */
4495 *inpos = collendpos;
4496 break;
4497 case 4: /* xmlcharrefreplace */
4498 /* generate replacement (temporarily (mis)uses p) */
4499 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4500 char buffer[2+29+1+1];
4501 char *cp;
4502 sprintf(buffer, "&#%d;", (int)p[collpos]);
4503 for (cp = buffer; *cp; ++cp) {
4504 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004507 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4509 return -1;
4510 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 }
4512 }
4513 *inpos = collendpos;
4514 break;
4515 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004516 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 encoding, reason, p, size, exceptionObject,
4518 collstartpos, collendpos, &newpos);
4519 if (repunicode == NULL)
4520 return -1;
4521 /* generate replacement */
4522 repsize = PyUnicode_GET_SIZE(repunicode);
4523 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4524 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004525 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 return -1;
4527 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4531 return -1;
4532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 }
4534 *inpos = newpos;
4535 Py_DECREF(repunicode);
4536 }
4537 return 0;
4538}
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 PyObject *mapping,
4543 const char *errors)
4544{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 /* output object */
4546 PyObject *res = NULL;
4547 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004548 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004550 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 PyObject *errorHandler = NULL;
4552 PyObject *exc = NULL;
4553 /* the following variable is used for caching string comparisons
4554 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4555 * 3=ignore, 4=xmlcharrefreplace */
4556 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557
4558 /* Default to Latin-1 */
4559 if (mapping == NULL)
4560 return PyUnicode_EncodeLatin1(p, size, errors);
4561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 /* allocate enough for a simple encoding without
4563 replacements, if we need more, we'll resize */
4564 res = PyString_FromStringAndSize(NULL, size);
4565 if (res == NULL)
4566 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004567 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 while (inpos<size) {
4571 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4573 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004575 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 if (charmap_encoding_error(p, size, &inpos, mapping,
4577 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004578 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004579 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004580 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 else
4584 /* done with this character => adjust input position */
4585 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 /* Resize if we allocated to much */
4589 if (respos<PyString_GET_SIZE(res)) {
4590 if (_PyString_Resize(&res, respos))
4591 goto onError;
4592 }
4593 Py_XDECREF(exc);
4594 Py_XDECREF(errorHandler);
4595 return res;
4596
4597 onError:
4598 Py_XDECREF(res);
4599 Py_XDECREF(exc);
4600 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 return NULL;
4602}
4603
4604PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4605 PyObject *mapping)
4606{
4607 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4608 PyErr_BadArgument();
4609 return NULL;
4610 }
4611 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4612 PyUnicode_GET_SIZE(unicode),
4613 mapping,
4614 NULL);
4615}
4616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617/* create or adjust a UnicodeTranslateError */
4618static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004619 const Py_UNICODE *unicode, Py_ssize_t size,
4620 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 if (*exceptionObject == NULL) {
4624 *exceptionObject = PyUnicodeTranslateError_Create(
4625 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 }
4627 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4629 goto onError;
4630 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4631 goto onError;
4632 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4633 goto onError;
4634 return;
4635 onError:
4636 Py_DECREF(*exceptionObject);
4637 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 }
4639}
4640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641/* raises a UnicodeTranslateError */
4642static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 const Py_UNICODE *unicode, Py_ssize_t size,
4644 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 const char *reason)
4646{
4647 make_translate_exception(exceptionObject,
4648 unicode, size, startpos, endpos, reason);
4649 if (*exceptionObject != NULL)
4650 PyCodec_StrictErrors(*exceptionObject);
4651}
4652
4653/* error handling callback helper:
4654 build arguments, call the callback and check the arguments,
4655 put the result into newpos and return the replacement string, which
4656 has to be freed by the caller */
4657static PyObject *unicode_translate_call_errorhandler(const char *errors,
4658 PyObject **errorHandler,
4659 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4661 Py_ssize_t startpos, Py_ssize_t endpos,
4662 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004664 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665
Martin v. Löwis412fb672006-04-13 06:34:32 +00004666 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 PyObject *restuple;
4668 PyObject *resunicode;
4669
4670 if (*errorHandler == NULL) {
4671 *errorHandler = PyCodec_LookupError(errors);
4672 if (*errorHandler == NULL)
4673 return NULL;
4674 }
4675
4676 make_translate_exception(exceptionObject,
4677 unicode, size, startpos, endpos, reason);
4678 if (*exceptionObject == NULL)
4679 return NULL;
4680
4681 restuple = PyObject_CallFunctionObjArgs(
4682 *errorHandler, *exceptionObject, NULL);
4683 if (restuple == NULL)
4684 return NULL;
4685 if (!PyTuple_Check(restuple)) {
4686 PyErr_Format(PyExc_TypeError, &argparse[4]);
4687 Py_DECREF(restuple);
4688 return NULL;
4689 }
4690 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004691 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 Py_DECREF(restuple);
4693 return NULL;
4694 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 if (i_newpos<0)
4696 *newpos = size+i_newpos;
4697 else
4698 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004699 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004700 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004701 Py_DECREF(restuple);
4702 return NULL;
4703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 Py_INCREF(resunicode);
4705 Py_DECREF(restuple);
4706 return resunicode;
4707}
4708
4709/* Lookup the character ch in the mapping and put the result in result,
4710 which must be decrefed by the caller.
4711 Return 0 on success, -1 on error */
4712static
4713int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4714{
4715 PyObject *w = PyInt_FromLong((long)c);
4716 PyObject *x;
4717
4718 if (w == NULL)
4719 return -1;
4720 x = PyObject_GetItem(mapping, w);
4721 Py_DECREF(w);
4722 if (x == NULL) {
4723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4724 /* No mapping found means: use 1:1 mapping. */
4725 PyErr_Clear();
4726 *result = NULL;
4727 return 0;
4728 } else
4729 return -1;
4730 }
4731 else if (x == Py_None) {
4732 *result = x;
4733 return 0;
4734 }
4735 else if (PyInt_Check(x)) {
4736 long value = PyInt_AS_LONG(x);
4737 long max = PyUnicode_GetMax();
4738 if (value < 0 || value > max) {
4739 PyErr_Format(PyExc_TypeError,
4740 "character mapping must be in range(0x%lx)", max+1);
4741 Py_DECREF(x);
4742 return -1;
4743 }
4744 *result = x;
4745 return 0;
4746 }
4747 else if (PyUnicode_Check(x)) {
4748 *result = x;
4749 return 0;
4750 }
4751 else {
4752 /* wrong return value */
4753 PyErr_SetString(PyExc_TypeError,
4754 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004755 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 return -1;
4757 }
4758}
4759/* ensure that *outobj is at least requiredsize characters long,
4760if not reallocate and adjust various state variables.
4761Return 0 on success, -1 on error */
4762static
Walter Dörwald4894c302003-10-24 14:25:28 +00004763int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004767 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004771 if (requiredsize < 2 * oldsize)
4772 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004773 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 return -1;
4775 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 }
4777 return 0;
4778}
4779/* lookup the character, put the result in the output string and adjust
4780 various state variables. Return a new reference to the object that
4781 was put in the output buffer in *result, or Py_None, if the mapping was
4782 undefined (in which case no character was written).
4783 The called must decref result.
4784 Return 0 on success, -1 on error. */
4785static
Walter Dörwald4894c302003-10-24 14:25:28 +00004786int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004788 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789{
Walter Dörwald4894c302003-10-24 14:25:28 +00004790 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 return -1;
4792 if (*res==NULL) {
4793 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004794 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 }
4796 else if (*res==Py_None)
4797 ;
4798 else if (PyInt_Check(*res)) {
4799 /* no overflow check, because we know that the space is enough */
4800 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4801 }
4802 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 if (repsize==1) {
4805 /* no overflow check, because we know that the space is enough */
4806 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4807 }
4808 else if (repsize!=0) {
4809 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004810 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004811 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004812 repsize - 1;
4813 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 return -1;
4815 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4816 *outp += repsize;
4817 }
4818 }
4819 else
4820 return -1;
4821 return 0;
4822}
4823
4824PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 PyObject *mapping,
4827 const char *errors)
4828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 /* output object */
4830 PyObject *res = NULL;
4831 /* pointers to the beginning and end+1 of input */
4832 const Py_UNICODE *startp = p;
4833 const Py_UNICODE *endp = p + size;
4834 /* pointer into the output */
4835 Py_UNICODE *str;
4836 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 char *reason = "character maps to <undefined>";
4839 PyObject *errorHandler = NULL;
4840 PyObject *exc = NULL;
4841 /* the following variable is used for caching string comparisons
4842 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4843 * 3=ignore, 4=xmlcharrefreplace */
4844 int known_errorHandler = -1;
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 if (mapping == NULL) {
4847 PyErr_BadArgument();
4848 return NULL;
4849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850
4851 /* allocate enough for a simple 1:1 translation without
4852 replacements, if we need more, we'll resize */
4853 res = PyUnicode_FromUnicode(NULL, size);
4854 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 return res;
4858 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 while (p<endp) {
4861 /* try to encode it */
4862 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004863 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 goto onError;
4866 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004867 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 if (x!=Py_None) /* it worked => adjust input pointer */
4869 ++p;
4870 else { /* untranslatable character */
4871 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t repsize;
4873 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 Py_UNICODE *uni2;
4875 /* startpos for collecting untranslatable chars */
4876 const Py_UNICODE *collstart = p;
4877 const Py_UNICODE *collend = p+1;
4878 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 /* find all untranslatable characters */
4881 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004882 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 goto onError;
4884 Py_XDECREF(x);
4885 if (x!=Py_None)
4886 break;
4887 ++collend;
4888 }
4889 /* cache callback name lookup
4890 * (if not done yet, i.e. it's the first error) */
4891 if (known_errorHandler==-1) {
4892 if ((errors==NULL) || (!strcmp(errors, "strict")))
4893 known_errorHandler = 1;
4894 else if (!strcmp(errors, "replace"))
4895 known_errorHandler = 2;
4896 else if (!strcmp(errors, "ignore"))
4897 known_errorHandler = 3;
4898 else if (!strcmp(errors, "xmlcharrefreplace"))
4899 known_errorHandler = 4;
4900 else
4901 known_errorHandler = 0;
4902 }
4903 switch (known_errorHandler) {
4904 case 1: /* strict */
4905 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4906 goto onError;
4907 case 2: /* replace */
4908 /* No need to check for space, this is a 1:1 replacement */
4909 for (coll = collstart; coll<collend; ++coll)
4910 *str++ = '?';
4911 /* fall through */
4912 case 3: /* ignore */
4913 p = collend;
4914 break;
4915 case 4: /* xmlcharrefreplace */
4916 /* generate replacement (temporarily (mis)uses p) */
4917 for (p = collstart; p < collend; ++p) {
4918 char buffer[2+29+1+1];
4919 char *cp;
4920 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4923 goto onError;
4924 for (cp = buffer; *cp; ++cp)
4925 *str++ = *cp;
4926 }
4927 p = collend;
4928 break;
4929 default:
4930 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4931 reason, startp, size, &exc,
4932 collstart-startp, collend-startp, &newpos);
4933 if (repunicode == NULL)
4934 goto onError;
4935 /* generate replacement */
4936 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004937 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4939 Py_DECREF(repunicode);
4940 goto onError;
4941 }
4942 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4943 *str++ = *uni2;
4944 p = startp + newpos;
4945 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 }
4947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 /* Resize if we allocated to much */
4950 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004951 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004952 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004953 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 }
4955 Py_XDECREF(exc);
4956 Py_XDECREF(errorHandler);
4957 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 onError:
4960 Py_XDECREF(res);
4961 Py_XDECREF(exc);
4962 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 return NULL;
4964}
4965
4966PyObject *PyUnicode_Translate(PyObject *str,
4967 PyObject *mapping,
4968 const char *errors)
4969{
4970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 str = PyUnicode_FromObject(str);
4973 if (str == NULL)
4974 goto onError;
4975 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4976 PyUnicode_GET_SIZE(str),
4977 mapping,
4978 errors);
4979 Py_DECREF(str);
4980 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004981
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 onError:
4983 Py_XDECREF(str);
4984 return NULL;
4985}
Tim Petersced69f82003-09-16 20:30:58 +00004986
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987/* --- Decimal Encoder ---------------------------------------------------- */
4988
4989int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004991 char *output,
4992 const char *errors)
4993{
4994 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 PyObject *errorHandler = NULL;
4996 PyObject *exc = NULL;
4997 const char *encoding = "decimal";
4998 const char *reason = "invalid decimal Unicode string";
4999 /* the following variable is used for caching string comparisons
5000 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5001 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005002
5003 if (output == NULL) {
5004 PyErr_BadArgument();
5005 return -1;
5006 }
5007
5008 p = s;
5009 end = s + length;
5010 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 Py_ssize_t repsize;
5015 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 Py_UNICODE *uni2;
5017 Py_UNICODE *collstart;
5018 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005019
Guido van Rossum9e896b32000-04-05 20:11:21 +00005020 if (Py_UNICODE_ISSPACE(ch)) {
5021 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005023 continue;
5024 }
5025 decimal = Py_UNICODE_TODECIMAL(ch);
5026 if (decimal >= 0) {
5027 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005029 continue;
5030 }
Guido van Rossumba477042000-04-06 18:18:10 +00005031 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005032 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005034 continue;
5035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 /* All other characters are considered unencodable */
5037 collstart = p;
5038 collend = p+1;
5039 while (collend < end) {
5040 if ((0 < *collend && *collend < 256) ||
5041 !Py_UNICODE_ISSPACE(*collend) ||
5042 Py_UNICODE_TODECIMAL(*collend))
5043 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 /* cache callback name lookup
5046 * (if not done yet, i.e. it's the first error) */
5047 if (known_errorHandler==-1) {
5048 if ((errors==NULL) || (!strcmp(errors, "strict")))
5049 known_errorHandler = 1;
5050 else if (!strcmp(errors, "replace"))
5051 known_errorHandler = 2;
5052 else if (!strcmp(errors, "ignore"))
5053 known_errorHandler = 3;
5054 else if (!strcmp(errors, "xmlcharrefreplace"))
5055 known_errorHandler = 4;
5056 else
5057 known_errorHandler = 0;
5058 }
5059 switch (known_errorHandler) {
5060 case 1: /* strict */
5061 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5062 goto onError;
5063 case 2: /* replace */
5064 for (p = collstart; p < collend; ++p)
5065 *output++ = '?';
5066 /* fall through */
5067 case 3: /* ignore */
5068 p = collend;
5069 break;
5070 case 4: /* xmlcharrefreplace */
5071 /* generate replacement (temporarily (mis)uses p) */
5072 for (p = collstart; p < collend; ++p)
5073 output += sprintf(output, "&#%d;", (int)*p);
5074 p = collend;
5075 break;
5076 default:
5077 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5078 encoding, reason, s, length, &exc,
5079 collstart-s, collend-s, &newpos);
5080 if (repunicode == NULL)
5081 goto onError;
5082 /* generate replacement */
5083 repsize = PyUnicode_GET_SIZE(repunicode);
5084 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5085 Py_UNICODE ch = *uni2;
5086 if (Py_UNICODE_ISSPACE(ch))
5087 *output++ = ' ';
5088 else {
5089 decimal = Py_UNICODE_TODECIMAL(ch);
5090 if (decimal >= 0)
5091 *output++ = '0' + decimal;
5092 else if (0 < ch && ch < 256)
5093 *output++ = (char)ch;
5094 else {
5095 Py_DECREF(repunicode);
5096 raise_encode_exception(&exc, encoding,
5097 s, length, collstart-s, collend-s, reason);
5098 goto onError;
5099 }
5100 }
5101 }
5102 p = s + newpos;
5103 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005104 }
5105 }
5106 /* 0-terminate the output string */
5107 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005108 Py_XDECREF(exc);
5109 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005110 return 0;
5111
5112 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 Py_XDECREF(exc);
5114 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005115 return -1;
5116}
5117
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118/* --- Helpers ------------------------------------------------------------ */
5119
Eric Smitha9f7d622008-02-17 19:46:49 +00005120#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005121
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005122#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005123
Fredrik Lundha50d2012006-05-26 17:04:58 +00005124#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005125
5126#include "stringlib/count.h"
5127#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005128#include "stringlib/partition.h"
5129
/* helper macro to fixup start/end slice values
 *
 * Operates on variables named `start` and `end` that must be in scope at the
 * expansion site, clamping them against (obj)->length: a negative value is
 * first offset by the length and then clamped to 0; `end` is additionally
 * capped at the length.  `start` is not capped at the length.
 *
 * Wrapped in do { ... } while (0) so that the expansion is a single
 * statement and stays correct under an unbraced if/else.  Use it as a
 * statement terminated by a semicolon.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5142
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005144 PyObject *substr,
5145 Py_ssize_t start,
5146 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005148 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005149 PyUnicodeObject* str_obj;
5150 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005151
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005152 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5153 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005155 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5156 if (!sub_obj) {
5157 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 return -1;
5159 }
Tim Petersced69f82003-09-16 20:30:58 +00005160
Fredrik Lundhc8162812006-05-26 19:33:03 +00005161 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005162
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005163 result = stringlib_count(
5164 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5165 );
5166
5167 Py_DECREF(sub_obj);
5168 Py_DECREF(str_obj);
5169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 return result;
5171}
5172
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005174 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005175 Py_ssize_t start,
5176 Py_ssize_t end,
5177 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005179 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005180
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005181 str = PyUnicode_FromObject(str);
5182 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005183 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005184 sub = PyUnicode_FromObject(sub);
5185 if (!sub) {
5186 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005187 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 }
Tim Petersced69f82003-09-16 20:30:58 +00005189
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005190 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005191 result = stringlib_find_slice(
5192 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5193 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5194 start, end
5195 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005196 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005197 result = stringlib_rfind_slice(
5198 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5199 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5200 start, end
5201 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005202
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005203 Py_DECREF(str);
5204 Py_DECREF(sub);
5205
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 return result;
5207}
5208
Tim Petersced69f82003-09-16 20:30:58 +00005209static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210int tailmatch(PyUnicodeObject *self,
5211 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005212 Py_ssize_t start,
5213 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 int direction)
5215{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (substring->length == 0)
5217 return 1;
5218
Fredrik Lundhc8162812006-05-26 19:33:03 +00005219 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
5221 end -= substring->length;
5222 if (end < start)
5223 return 0;
5224
5225 if (direction > 0) {
5226 if (Py_UNICODE_MATCH(self, end, substring))
5227 return 1;
5228 } else {
5229 if (Py_UNICODE_MATCH(self, start, substring))
5230 return 1;
5231 }
5232
5233 return 0;
5234}
5235
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t start,
5239 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 int direction)
5241{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005243
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 str = PyUnicode_FromObject(str);
5245 if (str == NULL)
5246 return -1;
5247 substr = PyUnicode_FromObject(substr);
5248 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005249 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 return -1;
5251 }
Tim Petersced69f82003-09-16 20:30:58 +00005252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 result = tailmatch((PyUnicodeObject *)str,
5254 (PyUnicodeObject *)substr,
5255 start, end, direction);
5256 Py_DECREF(str);
5257 Py_DECREF(substr);
5258 return result;
5259}
5260
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261/* Apply fixfct filter to the Unicode object self and return a
5262 reference to the modified object */
5263
Tim Petersced69f82003-09-16 20:30:58 +00005264static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265PyObject *fixup(PyUnicodeObject *self,
5266 int (*fixfct)(PyUnicodeObject *s))
5267{
5268
5269 PyUnicodeObject *u;
5270
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005271 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 if (u == NULL)
5273 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005274
5275 Py_UNICODE_COPY(u->str, self->str, self->length);
5276
Tim Peters7a29bd52001-09-12 03:03:31 +00005277 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 /* fixfct should return TRUE if it modified the buffer. If
5279 FALSE, return a reference to the original buffer instead
5280 (to save space, not time) */
5281 Py_INCREF(self);
5282 Py_DECREF(u);
5283 return (PyObject*) self;
5284 }
5285 return (PyObject*) u;
5286}
5287
Tim Petersced69f82003-09-16 20:30:58 +00005288static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289int fixupper(PyUnicodeObject *self)
5290{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 Py_UNICODE *s = self->str;
5293 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 while (len-- > 0) {
5296 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005297
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 ch = Py_UNICODE_TOUPPER(*s);
5299 if (ch != *s) {
5300 status = 1;
5301 *s = ch;
5302 }
5303 s++;
5304 }
5305
5306 return status;
5307}
5308
Tim Petersced69f82003-09-16 20:30:58 +00005309static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310int fixlower(PyUnicodeObject *self)
5311{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005312 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 Py_UNICODE *s = self->str;
5314 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 while (len-- > 0) {
5317 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005318
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 ch = Py_UNICODE_TOLOWER(*s);
5320 if (ch != *s) {
5321 status = 1;
5322 *s = ch;
5323 }
5324 s++;
5325 }
5326
5327 return status;
5328}
5329
Tim Petersced69f82003-09-16 20:30:58 +00005330static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331int fixswapcase(PyUnicodeObject *self)
5332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 Py_UNICODE *s = self->str;
5335 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 while (len-- > 0) {
5338 if (Py_UNICODE_ISUPPER(*s)) {
5339 *s = Py_UNICODE_TOLOWER(*s);
5340 status = 1;
5341 } else if (Py_UNICODE_ISLOWER(*s)) {
5342 *s = Py_UNICODE_TOUPPER(*s);
5343 status = 1;
5344 }
5345 s++;
5346 }
5347
5348 return status;
5349}
5350
Tim Petersced69f82003-09-16 20:30:58 +00005351static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352int fixcapitalize(PyUnicodeObject *self)
5353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005355 Py_UNICODE *s = self->str;
5356 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005358 if (len == 0)
5359 return 0;
5360 if (Py_UNICODE_ISLOWER(*s)) {
5361 *s = Py_UNICODE_TOUPPER(*s);
5362 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005364 s++;
5365 while (--len > 0) {
5366 if (Py_UNICODE_ISUPPER(*s)) {
5367 *s = Py_UNICODE_TOLOWER(*s);
5368 status = 1;
5369 }
5370 s++;
5371 }
5372 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373}
5374
5375static
5376int fixtitle(PyUnicodeObject *self)
5377{
5378 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5379 register Py_UNICODE *e;
5380 int previous_is_cased;
5381
5382 /* Shortcut for single character strings */
5383 if (PyUnicode_GET_SIZE(self) == 1) {
5384 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5385 if (*p != ch) {
5386 *p = ch;
5387 return 1;
5388 }
5389 else
5390 return 0;
5391 }
Tim Petersced69f82003-09-16 20:30:58 +00005392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 e = p + PyUnicode_GET_SIZE(self);
5394 previous_is_cased = 0;
5395 for (; p < e; p++) {
5396 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005397
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 if (previous_is_cased)
5399 *p = Py_UNICODE_TOLOWER(ch);
5400 else
5401 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005402
5403 if (Py_UNICODE_ISLOWER(ch) ||
5404 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 Py_UNICODE_ISTITLE(ch))
5406 previous_is_cased = 1;
5407 else
5408 previous_is_cased = 0;
5409 }
5410 return 1;
5411}
5412
Tim Peters8ce9f162004-08-27 01:49:32 +00005413PyObject *
5414PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Tim Peters8ce9f162004-08-27 01:49:32 +00005416 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005417 const Py_UNICODE blank = ' ';
5418 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005419 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005421 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5422 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5424 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005426 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005427 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 fseq = PySequence_Fast(seq, "");
5430 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005431 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005432 }
5433
Tim Peters91879ab2004-08-27 22:35:44 +00005434 /* Grrrr. A codec may be invoked to convert str objects to
5435 * Unicode, and so it's possible to call back into Python code
5436 * during PyUnicode_FromObject(), and so it's possible for a sick
5437 * codec to change the size of fseq (if seq is a list). Therefore
5438 * we have to keep refetching the size -- can't assume seqlen
5439 * is invariant.
5440 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005441 seqlen = PySequence_Fast_GET_SIZE(fseq);
5442 /* If empty sequence, return u"". */
5443 if (seqlen == 0) {
5444 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5445 goto Done;
5446 }
5447 /* If singleton sequence with an exact Unicode, return that. */
5448 if (seqlen == 1) {
5449 item = PySequence_Fast_GET_ITEM(fseq, 0);
5450 if (PyUnicode_CheckExact(item)) {
5451 Py_INCREF(item);
5452 res = (PyUnicodeObject *)item;
5453 goto Done;
5454 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005455 }
5456
Tim Peters05eba1f2004-08-27 21:32:02 +00005457 /* At least two items to join, or one that isn't exact Unicode. */
5458 if (seqlen > 1) {
5459 /* Set up sep and seplen -- they're needed. */
5460 if (separator == NULL) {
5461 sep = &blank;
5462 seplen = 1;
5463 }
5464 else {
5465 internal_separator = PyUnicode_FromObject(separator);
5466 if (internal_separator == NULL)
5467 goto onError;
5468 sep = PyUnicode_AS_UNICODE(internal_separator);
5469 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005470 /* In case PyUnicode_FromObject() mutated seq. */
5471 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 }
5473 }
5474
5475 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005476 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005477 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005478 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 res_p = PyUnicode_AS_UNICODE(res);
5480 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005481
Tim Peters05eba1f2004-08-27 21:32:02 +00005482 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005483 Py_ssize_t itemlen;
5484 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005485
5486 item = PySequence_Fast_GET_ITEM(fseq, i);
5487 /* Convert item to Unicode. */
5488 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5489 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005490 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005491 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005492 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005493 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005494 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005495 item = PyUnicode_FromObject(item);
5496 if (item == NULL)
5497 goto onError;
5498 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005499
Tim Peters91879ab2004-08-27 22:35:44 +00005500 /* In case PyUnicode_FromObject() mutated seq. */
5501 seqlen = PySequence_Fast_GET_SIZE(fseq);
5502
Tim Peters8ce9f162004-08-27 01:49:32 +00005503 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005506 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 if (i < seqlen - 1) {
5509 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005510 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 goto Overflow;
5512 }
5513 if (new_res_used > res_alloc) {
5514 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005515 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005517 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005518 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005519 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005520 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005521 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005523 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005526
5527 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005528 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 res_p += itemlen;
5530 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005531 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 res_p += seplen;
5533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 res_used = new_res_used;
5536 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005537
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 /* Shrink res to match the used area; this probably can't fail,
5539 * but it's cheap to check.
5540 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005541 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 goto onError;
5543
5544 Done:
5545 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005546 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 return (PyObject *)res;
5548
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 Overflow:
5550 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005551 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005552 Py_DECREF(item);
5553 /* fall through */
5554
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 return NULL;
5560}
5561
Tim Petersced69f82003-09-16 20:30:58 +00005562static
5563PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t left,
5565 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 Py_UNICODE fill)
5567{
5568 PyUnicodeObject *u;
5569
5570 if (left < 0)
5571 left = 0;
5572 if (right < 0)
5573 right = 0;
5574
Tim Peters7a29bd52001-09-12 03:03:31 +00005575 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 Py_INCREF(self);
5577 return self;
5578 }
5579
5580 u = _PyUnicode_New(left + self->length + right);
5581 if (u) {
5582 if (left)
5583 Py_UNICODE_FILL(u->str, fill, left);
5584 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5585 if (right)
5586 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5587 }
5588
5589 return u;
5590}
5591
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a variable `str` to hold the temporary
   object, the target `list`, and an `onError` label that is jumped to when
   either the object creation or the append fails.  The temporary reference
   in `str` is released on every path.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else.  Use it as a
   statement terminated by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5602
5603static
5604PyObject *split_whitespace(PyUnicodeObject *self,
5605 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005606 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005608 register Py_ssize_t i;
5609 register Py_ssize_t j;
5610 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005612 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614 for (i = j = 0; i < len; ) {
5615 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005616 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 i++;
5618 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005619 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 i++;
5621 if (j < i) {
5622 if (maxcount-- <= 0)
5623 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005624 SPLIT_APPEND(buf, j, i);
5625 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 i++;
5627 j = i;
5628 }
5629 }
5630 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005631 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
5633 return list;
5634
5635 onError:
5636 Py_DECREF(list);
5637 return NULL;
5638}
5639
5640PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005641 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005643 register Py_ssize_t i;
5644 register Py_ssize_t j;
5645 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 PyObject *list;
5647 PyObject *str;
5648 Py_UNICODE *data;
5649
5650 string = PyUnicode_FromObject(string);
5651 if (string == NULL)
5652 return NULL;
5653 data = PyUnicode_AS_UNICODE(string);
5654 len = PyUnicode_GET_SIZE(string);
5655
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 list = PyList_New(0);
5657 if (!list)
5658 goto onError;
5659
5660 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005661 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005664 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
5667 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005668 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 if (i < len) {
5670 if (data[i] == '\r' && i + 1 < len &&
5671 data[i+1] == '\n')
5672 i += 2;
5673 else
5674 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005675 if (keepends)
5676 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
Guido van Rossum86662912000-04-11 15:38:46 +00005678 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 j = i;
5680 }
5681 if (j < len) {
5682 SPLIT_APPEND(data, j, len);
5683 }
5684
5685 Py_DECREF(string);
5686 return list;
5687
5688 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005689 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 Py_DECREF(string);
5691 return NULL;
5692}
5693
Tim Petersced69f82003-09-16 20:30:58 +00005694static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695PyObject *split_char(PyUnicodeObject *self,
5696 PyObject *list,
5697 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005698 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 register Py_ssize_t i;
5701 register Py_ssize_t j;
5702 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005704 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
5706 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005707 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 if (maxcount-- <= 0)
5709 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005710 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 i = j = i + 1;
5712 } else
5713 i++;
5714 }
5715 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005716 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 }
5718 return list;
5719
5720 onError:
5721 Py_DECREF(list);
5722 return NULL;
5723}
5724
Tim Petersced69f82003-09-16 20:30:58 +00005725static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726PyObject *split_substring(PyUnicodeObject *self,
5727 PyObject *list,
5728 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005731 register Py_ssize_t i;
5732 register Py_ssize_t j;
5733 Py_ssize_t len = self->length;
5734 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 PyObject *str;
5736
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005737 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 if (Py_UNICODE_MATCH(self, i, substring)) {
5739 if (maxcount-- <= 0)
5740 break;
5741 SPLIT_APPEND(self->str, j, i);
5742 i = j = i + sublen;
5743 } else
5744 i++;
5745 }
5746 if (j <= len) {
5747 SPLIT_APPEND(self->str, j, len);
5748 }
5749 return list;
5750
5751 onError:
5752 Py_DECREF(list);
5753 return NULL;
5754}
5755
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756static
5757PyObject *rsplit_whitespace(PyUnicodeObject *self,
5758 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005760{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 register Py_ssize_t i;
5762 register Py_ssize_t j;
5763 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005765 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766
5767 for (i = j = len - 1; i >= 0; ) {
5768 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005769 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770 i--;
5771 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005772 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773 i--;
5774 if (j > i) {
5775 if (maxcount-- <= 0)
5776 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005777 SPLIT_APPEND(buf, i + 1, j + 1);
5778 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779 i--;
5780 j = i;
5781 }
5782 }
5783 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005784 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005786 if (PyList_Reverse(list) < 0)
5787 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788 return list;
5789
5790 onError:
5791 Py_DECREF(list);
5792 return NULL;
5793}
5794
5795static
5796PyObject *rsplit_char(PyUnicodeObject *self,
5797 PyObject *list,
5798 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005799 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005801 register Py_ssize_t i;
5802 register Py_ssize_t j;
5803 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005805 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005806
5807 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005808 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005809 if (maxcount-- <= 0)
5810 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005811 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812 j = i = i - 1;
5813 } else
5814 i--;
5815 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005816 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005817 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005819 if (PyList_Reverse(list) < 0)
5820 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005821 return list;
5822
5823 onError:
5824 Py_DECREF(list);
5825 return NULL;
5826}
5827
5828static
5829PyObject *rsplit_substring(PyUnicodeObject *self,
5830 PyObject *list,
5831 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005833{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005834 register Py_ssize_t i;
5835 register Py_ssize_t j;
5836 Py_ssize_t len = self->length;
5837 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 PyObject *str;
5839
5840 for (i = len - sublen, j = len; i >= 0; ) {
5841 if (Py_UNICODE_MATCH(self, i, substring)) {
5842 if (maxcount-- <= 0)
5843 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005844 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005845 j = i;
5846 i -= sublen;
5847 } else
5848 i--;
5849 }
5850 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005851 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005853 if (PyList_Reverse(list) < 0)
5854 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005855 return list;
5856
5857 onError:
5858 Py_DECREF(list);
5859 return NULL;
5860}
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862#undef SPLIT_APPEND
5863
5864static
5865PyObject *split(PyUnicodeObject *self,
5866 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
5869 PyObject *list;
5870
5871 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005872 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874 list = PyList_New(0);
5875 if (!list)
5876 return NULL;
5877
5878 if (substring == NULL)
5879 return split_whitespace(self,list,maxcount);
5880
5881 else if (substring->length == 1)
5882 return split_char(self,list,substring->str[0],maxcount);
5883
5884 else if (substring->length == 0) {
5885 Py_DECREF(list);
5886 PyErr_SetString(PyExc_ValueError, "empty separator");
5887 return NULL;
5888 }
5889 else
5890 return split_substring(self,list,substring,maxcount);
5891}
5892
Tim Petersced69f82003-09-16 20:30:58 +00005893static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894PyObject *rsplit(PyUnicodeObject *self,
5895 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005896 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005897{
5898 PyObject *list;
5899
5900 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005901 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005902
5903 list = PyList_New(0);
5904 if (!list)
5905 return NULL;
5906
5907 if (substring == NULL)
5908 return rsplit_whitespace(self,list,maxcount);
5909
5910 else if (substring->length == 1)
5911 return rsplit_char(self,list,substring->str[0],maxcount);
5912
5913 else if (substring->length == 0) {
5914 Py_DECREF(list);
5915 PyErr_SetString(PyExc_ValueError, "empty separator");
5916 return NULL;
5917 }
5918 else
5919 return rsplit_substring(self,list,substring,maxcount);
5920}
5921
5922static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923PyObject *replace(PyUnicodeObject *self,
5924 PyUnicodeObject *str1,
5925 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927{
5928 PyUnicodeObject *u;
5929
5930 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005931 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Fredrik Lundh347ee272006-05-24 16:35:18 +00005933 if (str1->length == str2->length) {
5934 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005935 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005936 if (str1->length == 1) {
5937 /* replace characters */
5938 Py_UNICODE u1, u2;
5939 if (!findchar(self->str, self->length, str1->str[0]))
5940 goto nothing;
5941 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5942 if (!u)
5943 return NULL;
5944 Py_UNICODE_COPY(u->str, self->str, self->length);
5945 u1 = str1->str[0];
5946 u2 = str2->str[0];
5947 for (i = 0; i < u->length; i++)
5948 if (u->str[i] == u1) {
5949 if (--maxcount < 0)
5950 break;
5951 u->str[i] = u2;
5952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005954 i = fastsearch(
5955 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005957 if (i < 0)
5958 goto nothing;
5959 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5960 if (!u)
5961 return NULL;
5962 Py_UNICODE_COPY(u->str, self->str, self->length);
5963 while (i <= self->length - str1->length)
5964 if (Py_UNICODE_MATCH(self, i, str1)) {
5965 if (--maxcount < 0)
5966 break;
5967 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5968 i += str1->length;
5969 } else
5970 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005973
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005974 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005975 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 Py_UNICODE *p;
5977
5978 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005979 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 if (n > maxcount)
5981 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005982 if (n == 0)
5983 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005984 /* new_size = self->length + n * (str2->length - str1->length)); */
5985 delta = (str2->length - str1->length);
5986 if (delta == 0) {
5987 new_size = self->length;
5988 } else {
5989 product = n * (str2->length - str1->length);
5990 if ((product / (str2->length - str1->length)) != n) {
5991 PyErr_SetString(PyExc_OverflowError,
5992 "replace string is too long");
5993 return NULL;
5994 }
5995 new_size = self->length + product;
5996 if (new_size < 0) {
5997 PyErr_SetString(PyExc_OverflowError,
5998 "replace string is too long");
5999 return NULL;
6000 }
6001 }
6002 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006003 if (!u)
6004 return NULL;
6005 i = 0;
6006 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006007 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006008 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006009 while (n-- > 0) {
6010 /* look for next match */
6011 j = i;
6012 while (j <= e) {
6013 if (Py_UNICODE_MATCH(self, j, str1))
6014 break;
6015 j++;
6016 }
6017 if (j > i) {
6018 if (j > e)
6019 break;
6020 /* copy unchanged part [i:j] */
6021 Py_UNICODE_COPY(p, self->str+i, j-i);
6022 p += j - i;
6023 }
6024 /* copy substitution string */
6025 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006026 Py_UNICODE_COPY(p, str2->str, str2->length);
6027 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006028 }
6029 i = j + str1->length;
6030 }
6031 if (i < self->length)
6032 /* copy tail [i:] */
6033 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006034 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006035 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006036 while (n > 0) {
6037 Py_UNICODE_COPY(p, str2->str, str2->length);
6038 p += str2->length;
6039 if (--n <= 0)
6040 break;
6041 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006043 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 }
6045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006047
6048nothing:
6049 /* nothing to replace; return original string (when possible) */
6050 if (PyUnicode_CheckExact(self)) {
6051 Py_INCREF(self);
6052 return (PyObject *) self;
6053 }
6054 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
6057/* --- Unicode Object Methods --------------------------------------------- */
6058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060"S.title() -> unicode\n\
6061\n\
6062Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
6065static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006066unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 return fixup(self, fixtitle);
6069}
6070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072"S.capitalize() -> unicode\n\
6073\n\
6074Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006075have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006078unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return fixup(self, fixcapitalize);
6081}
6082
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self at whitespace, runs every word through
   fixup(..., fixcapitalize) and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6120
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006121/* Argument converter. Coerces to a single unicode character */
6122
6123static int
6124convert_uc(PyObject *obj, void *addr)
6125{
6126 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6127 PyObject *uniobj;
6128 Py_UNICODE *unistr;
6129
6130 uniobj = PyUnicode_FromObject(obj);
6131 if (uniobj == NULL) {
6132 PyErr_SetString(PyExc_TypeError,
6133 "The fill character cannot be converted to Unicode");
6134 return 0;
6135 }
6136 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6137 PyErr_SetString(PyExc_TypeError,
6138 "The fill character must be exactly one character long");
6139 Py_DECREF(uniobj);
6140 return 0;
6141 }
6142 unistr = PyUnicode_AS_UNICODE(uniobj);
6143 *fillcharloc = unistr[0];
6144 Py_DECREF(uniobj);
6145 return 1;
6146}
6147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006148PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006149"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006151Return S centered in a Unicode string of length width. Padding is\n\
6152done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
6154static PyObject *
6155unicode_center(PyUnicodeObject *self, PyObject *args)
6156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t marg, left;
6158 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006159 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Thomas Woutersde017742006-02-16 19:34:37 +00006161 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 return NULL;
6163
Tim Peters7a29bd52001-09-12 03:03:31 +00006164 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 Py_INCREF(self);
6166 return (PyObject*) self;
6167 }
6168
6169 marg = width - self->length;
6170 left = marg / 2 + (marg & width & 1);
6171
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006172 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173}
6174
Marc-André Lemburge5034372000-08-08 08:04:29 +00006175#if 0
6176
6177/* This code should go into some future Unicode collation support
6178 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006179 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006180
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006181/* speedy UTF-16 code point order comparison */
6182/* gleaned from: */
6183/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6184
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006185static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006187 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006188 0, 0, 0, 0, 0, 0, 0, 0,
6189 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006190 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006191};
6192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193static int
6194unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006197
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 Py_UNICODE *s1 = str1->str;
6199 Py_UNICODE *s2 = str2->str;
6200
6201 len1 = str1->length;
6202 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006205 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006206
6207 c1 = *s1++;
6208 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006209
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006210 if (c1 > (1<<11) * 26)
6211 c1 += utf16Fixup[c1>>11];
6212 if (c2 > (1<<11) * 26)
6213 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006214 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006215
6216 if (c1 != c2)
6217 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006218
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006219 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
6221
6222 return (len1 < len2) ? -1 : (len1 != len2);
6223}
6224
Marc-André Lemburge5034372000-08-08 08:04:29 +00006225#else
6226
6227static int
6228unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006231
6232 Py_UNICODE *s1 = str1->str;
6233 Py_UNICODE *s2 = str2->str;
6234
6235 len1 = str1->length;
6236 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Marc-André Lemburge5034372000-08-08 08:04:29 +00006238 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006239 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006240
Fredrik Lundh45714e92001-06-26 16:39:36 +00006241 c1 = *s1++;
6242 c2 = *s2++;
6243
6244 if (c1 != c2)
6245 return (c1 < c2) ? -1 : 1;
6246
Marc-André Lemburge5034372000-08-08 08:04:29 +00006247 len1--; len2--;
6248 }
6249
6250 return (len1 < len2) ? -1 : (len1 != len2);
6251}
6252
6253#endif
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255int PyUnicode_Compare(PyObject *left,
6256 PyObject *right)
6257{
6258 PyUnicodeObject *u = NULL, *v = NULL;
6259 int result;
6260
6261 /* Coerce the two arguments */
6262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263 if (u == NULL)
6264 goto onError;
6265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266 if (v == NULL)
6267 goto onError;
6268
Thomas Wouters7e474022000-07-16 12:04:32 +00006269 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 if (v == u) {
6271 Py_DECREF(u);
6272 Py_DECREF(v);
6273 return 0;
6274 }
6275
6276 result = unicode_compare(u, v);
6277
6278 Py_DECREF(u);
6279 Py_DECREF(v);
6280 return result;
6281
6282onError:
6283 Py_XDECREF(u);
6284 Py_XDECREF(v);
6285 return -1;
6286}
6287
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006288PyObject *PyUnicode_RichCompare(PyObject *left,
6289 PyObject *right,
6290 int op)
6291{
6292 int result;
6293
6294 result = PyUnicode_Compare(left, right);
6295 if (result == -1 && PyErr_Occurred())
6296 goto onError;
6297
6298 /* Convert the return value to a Boolean */
6299 switch (op) {
6300 case Py_EQ:
6301 result = (result == 0);
6302 break;
6303 case Py_NE:
6304 result = (result != 0);
6305 break;
6306 case Py_LE:
6307 result = (result <= 0);
6308 break;
6309 case Py_GE:
6310 result = (result >= 0);
6311 break;
6312 case Py_LT:
6313 result = (result == -1);
6314 break;
6315 case Py_GT:
6316 result = (result == 1);
6317 break;
6318 }
6319 return PyBool_FromLong(result);
6320
6321 onError:
6322
6323 /* Standard case
6324
6325 Type errors mean that PyUnicode_FromObject() could not convert
6326 one of the arguments (usually the right hand side) to Unicode,
6327 ie. we can't handle the comparison request. However, it is
6328 possible that the other object knows a comparison method, which
6329 is why we return Py_NotImplemented to give the other object a
6330 chance.
6331
6332 */
6333 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6334 PyErr_Clear();
6335 Py_INCREF(Py_NotImplemented);
6336 return Py_NotImplemented;
6337 }
6338 if (op != Py_EQ && op != Py_NE)
6339 return NULL;
6340
6341 /* Equality comparison.
6342
6343 This is a special case: we silence any PyExc_UnicodeDecodeError
6344 and instead turn it into a PyErr_UnicodeWarning.
6345
6346 */
6347 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6348 return NULL;
6349 PyErr_Clear();
6350 if (PyErr_Warn(PyExc_UnicodeWarning,
6351 (op == Py_EQ) ?
6352 "Unicode equal comparison "
6353 "failed to convert both arguments to Unicode - "
6354 "interpreting them as being unequal" :
6355 "Unicode unequal comparison "
6356 "failed to convert both arguments to Unicode - "
6357 "interpreting them as being unequal"
6358 ) < 0)
6359 return NULL;
6360 result = (op == Py_NE);
6361 return PyBool_FromLong(result);
6362}
6363
Guido van Rossum403d68b2000-03-13 15:55:09 +00006364int PyUnicode_Contains(PyObject *container,
6365 PyObject *element)
6366{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006367 PyObject *str, *sub;
6368 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006369
6370 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006371 sub = PyUnicode_FromObject(element);
6372 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006373 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006374 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006375 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006376 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006377
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006378 str = PyUnicode_FromObject(container);
6379 if (!str) {
6380 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006381 return -1;
6382 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006383
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006384 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006385
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006386 Py_DECREF(str);
6387 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006388
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006389 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006390}
6391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392/* Concat to string or Unicode object giving a new Unicode object. */
6393
6394PyObject *PyUnicode_Concat(PyObject *left,
6395 PyObject *right)
6396{
6397 PyUnicodeObject *u = NULL, *v = NULL, *w;
6398
6399 /* Coerce the two arguments */
6400 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6401 if (u == NULL)
6402 goto onError;
6403 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6404 if (v == NULL)
6405 goto onError;
6406
6407 /* Shortcuts */
6408 if (v == unicode_empty) {
6409 Py_DECREF(v);
6410 return (PyObject *)u;
6411 }
6412 if (u == unicode_empty) {
6413 Py_DECREF(u);
6414 return (PyObject *)v;
6415 }
6416
6417 /* Concat the two Unicode strings */
6418 w = _PyUnicode_New(u->length + v->length);
6419 if (w == NULL)
6420 goto onError;
6421 Py_UNICODE_COPY(w->str, u->str, u->length);
6422 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6423
6424 Py_DECREF(u);
6425 Py_DECREF(v);
6426 return (PyObject *)w;
6427
6428onError:
6429 Py_XDECREF(u);
6430 Py_XDECREF(v);
6431 return NULL;
6432}
6433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435"S.count(sub[, start[, end]]) -> int\n\
6436\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006437Return the number of non-overlapping occurrences of substring sub in\n\
6438Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006439interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
6441static PyObject *
6442unicode_count(PyUnicodeObject *self, PyObject *args)
6443{
6444 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006445 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006446 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 PyObject *result;
6448
Guido van Rossumb8872e62000-05-09 14:14:27 +00006449 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 return NULL;
6452
6453 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006454 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 if (substring == NULL)
6456 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006457
Fredrik Lundhc8162812006-05-26 19:33:03 +00006458 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006460 result = PyInt_FromSsize_t(
6461 stringlib_count(self->str + start, end - start,
6462 substring->str, substring->length)
6463 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 return result;
6468}
6469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006470PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006471"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006473Encodes S using the codec registered for encoding. encoding defaults\n\
6474to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006475handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6477'xmlcharrefreplace' as well as any other name registered with\n\
6478codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
6480static PyObject *
6481unicode_encode(PyUnicodeObject *self, PyObject *args)
6482{
6483 char *encoding = NULL;
6484 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485 PyObject *v;
6486
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6488 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006489 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006490 if (v == NULL)
6491 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006492 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6493 PyErr_Format(PyExc_TypeError,
6494 "encoder did not return a string/unicode object "
6495 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006496 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497 Py_DECREF(v);
6498 return NULL;
6499 }
6500 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006501
6502 onError:
6503 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006504}
6505
6506PyDoc_STRVAR(decode__doc__,
6507"S.decode([encoding[,errors]]) -> string or unicode\n\
6508\n\
6509Decodes S using the codec registered for encoding. encoding defaults\n\
6510to the default encoding. errors may be given to set a different error\n\
6511handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6512a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6513as well as any other name registerd with codecs.register_error that is\n\
6514able to handle UnicodeDecodeErrors.");
6515
6516static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006517unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006518{
6519 char *encoding = NULL;
6520 char *errors = NULL;
6521 PyObject *v;
6522
6523 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6524 return NULL;
6525 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006526 if (v == NULL)
6527 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006528 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6529 PyErr_Format(PyExc_TypeError,
6530 "decoder did not return a string/unicode object "
6531 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006532 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006533 Py_DECREF(v);
6534 return NULL;
6535 }
6536 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006537
6538 onError:
6539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540}
6541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006542PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543"S.expandtabs([tabsize]) -> unicode\n\
6544\n\
6545Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006546If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547
6548static PyObject*
6549unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6550{
6551 Py_UNICODE *e;
6552 Py_UNICODE *p;
6553 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006554 Py_UNICODE *qe;
6555 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 PyUnicodeObject *u;
6557 int tabsize = 8;
6558
6559 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6560 return NULL;
6561
Thomas Wouters7e474022000-07-16 12:04:32 +00006562 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006563 i = 0; /* chars up to and including most recent \n or \r */
6564 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6565 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 for (p = self->str; p < e; p++)
6567 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006568 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006569 incr = tabsize - (j % tabsize); /* cannot overflow */
6570 if (j > PY_SSIZE_T_MAX - incr)
6571 goto overflow1;
6572 j += incr;
6573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 }
6575 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006576 if (j > PY_SSIZE_T_MAX - 1)
6577 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 j++;
6579 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006580 if (i > PY_SSIZE_T_MAX - j)
6581 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006583 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 }
6585 }
6586
Guido van Rossum5bdff602008-03-11 21:18:06 +00006587 if (i > PY_SSIZE_T_MAX - j)
6588 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 /* Second pass: create output string and fill it */
6591 u = _PyUnicode_New(i + j);
6592 if (!u)
6593 return NULL;
6594
Guido van Rossum5bdff602008-03-11 21:18:06 +00006595 j = 0; /* same as in first pass */
6596 q = u->str; /* next output char */
6597 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
6599 for (p = self->str; p < e; p++)
6600 if (*p == '\t') {
6601 if (tabsize > 0) {
6602 i = tabsize - (j % tabsize);
6603 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006604 while (i--) {
6605 if (q >= qe)
6606 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 }
6610 }
6611 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006612 if (q >= qe)
6613 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006615 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 if (*p == '\n' || *p == '\r')
6617 j = 0;
6618 }
6619
6620 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006621
6622 overflow2:
6623 Py_DECREF(u);
6624 overflow1:
6625 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006629PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630"S.find(sub [,start [,end]]) -> int\n\
6631\n\
6632Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006633such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634arguments start and end are interpreted as in slice notation.\n\
6635\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
6638static PyObject *
6639unicode_find(PyUnicodeObject *self, PyObject *args)
6640{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006641 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006642 Py_ssize_t start;
6643 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006644 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
Facundo Batista57d56692007-11-16 18:04:14 +00006646 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006649 result = stringlib_find_slice(
6650 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6651 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6652 start, end
6653 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006656
6657 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658}
6659
6660static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
6663 if (index < 0 || index >= self->length) {
6664 PyErr_SetString(PyExc_IndexError, "string index out of range");
6665 return NULL;
6666 }
6667
6668 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6669}
6670
6671static long
6672unicode_hash(PyUnicodeObject *self)
6673{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006674 /* Since Unicode objects compare equal to their ASCII string
6675 counterparts, they should use the individual character values
6676 as basis for their hash value. This is needed to assure that
6677 strings and Unicode objects behave in the same way as
6678 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
Martin v. Löwis18e16552006-02-15 17:27:45 +00006680 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006681 register Py_UNICODE *p;
6682 register long x;
6683
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 if (self->hash != -1)
6685 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006686 len = PyUnicode_GET_SIZE(self);
6687 p = PyUnicode_AS_UNICODE(self);
6688 x = *p << 7;
6689 while (--len >= 0)
6690 x = (1000003*x) ^ *p++;
6691 x ^= PyUnicode_GET_SIZE(self);
6692 if (x == -1)
6693 x = -2;
6694 self->hash = x;
6695 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696}
6697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006698PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699"S.index(sub [,start [,end]]) -> int\n\
6700\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006701Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703static PyObject *
6704unicode_index(PyUnicodeObject *self, PyObject *args)
6705{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006706 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006707 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006708 Py_ssize_t start;
6709 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710
Facundo Batista57d56692007-11-16 18:04:14 +00006711 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006714 result = stringlib_find_slice(
6715 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6716 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6717 start, end
6718 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
6720 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 if (result < 0) {
6723 PyErr_SetString(PyExc_ValueError, "substring not found");
6724 return NULL;
6725 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006726
Martin v. Löwis18e16552006-02-15 17:27:45 +00006727 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006733Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
6736static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006737unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register const Py_UNICODE *e;
6741 int cased;
6742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 /* Shortcut for single character strings */
6744 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006745 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006747 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006748 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006749 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006750
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 e = p + PyUnicode_GET_SIZE(self);
6752 cased = 0;
6753 for (; p < e; p++) {
6754 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006757 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 else if (!cased && Py_UNICODE_ISLOWER(ch))
6759 cased = 1;
6760 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006761 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006764PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006767Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774 register const Py_UNICODE *e;
6775 int cased;
6776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006781 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006782 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 e = p + PyUnicode_GET_SIZE(self);
6786 cased = 0;
6787 for (; p < e; p++) {
6788 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006789
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 else if (!cased && Py_UNICODE_ISUPPER(ch))
6793 cased = 1;
6794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006801Return True if S is a titlecased string and there is at least one\n\
6802character in S, i.e. upper- and titlecase characters may only\n\
6803follow uncased characters and lowercase characters only cased ones.\n\
6804Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006807unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
6809 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6810 register const Py_UNICODE *e;
6811 int cased, previous_is_cased;
6812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 /* Shortcut for single character strings */
6814 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6816 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006819 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 e = p + PyUnicode_GET_SIZE(self);
6823 cased = 0;
6824 previous_is_cased = 0;
6825 for (; p < e; p++) {
6826 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6829 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 previous_is_cased = 1;
6832 cased = 1;
6833 }
6834 else if (Py_UNICODE_ISLOWER(ch)) {
6835 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 previous_is_cased = 1;
6838 cased = 1;
6839 }
6840 else
6841 previous_is_cased = 0;
6842 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006849Return True if all characters in S are whitespace\n\
6850and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
6852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006853unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856 register const Py_UNICODE *e;
6857
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self) == 1 &&
6860 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006863 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006864 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006866
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 e = p + PyUnicode_GET_SIZE(self);
6868 for (; p < e; p++) {
6869 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006878Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006880
6881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006882unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006883{
6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885 register const Py_UNICODE *e;
6886
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self) == 1 &&
6889 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891
6892 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006893 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895
6896 e = p + PyUnicode_GET_SIZE(self);
6897 for (; p < e; p++) {
6898 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902}
6903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006904PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006907Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006908and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006909
6910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006911unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006912{
6913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914 register const Py_UNICODE *e;
6915
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self) == 1 &&
6918 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006920
6921 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006922 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006924
6925 e = p + PyUnicode_GET_SIZE(self);
6926 for (; p < e; p++) {
6927 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006931}
6932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006940unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943 register const Py_UNICODE *e;
6944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1 &&
6947 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006948 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006951 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 e = p + PyUnicode_GET_SIZE(self);
6955 for (; p < e; p++) {
6956 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960}
6961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006965Return True if all characters in S are digits\n\
6966and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
6968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006969unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
6971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6972 register const Py_UNICODE *e;
6973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 /* Shortcut for single character strings */
6975 if (PyUnicode_GET_SIZE(self) == 1 &&
6976 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006977 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006979 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006980 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006981 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006982
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 e = p + PyUnicode_GET_SIZE(self);
6984 for (; p < e; p++) {
6985 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006986 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989}
6990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006994Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006995False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
7000 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7001 register const Py_UNICODE *e;
7002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 /* Shortcut for single character strings */
7004 if (PyUnicode_GET_SIZE(self) == 1 &&
7005 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007006 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007008 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007009 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007010 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007011
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 e = p + PyUnicode_GET_SIZE(self);
7013 for (; p < e; p++) {
7014 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007015 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007017 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018}
7019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007020PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021"S.join(sequence) -> unicode\n\
7022\n\
7023Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
7026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007027unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007029 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030}
7031
Martin v. Löwis18e16552006-02-15 17:27:45 +00007032static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033unicode_length(PyUnicodeObject *self)
7034{
7035 return self->length;
7036}
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007039"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
7041Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007042done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044static PyObject *
7045unicode_ljust(PyUnicodeObject *self, PyObject *args)
7046{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007047 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007048 Py_UNICODE fillchar = ' ';
7049
Martin v. Löwis412fb672006-04-13 06:34:32 +00007050 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 return NULL;
7052
Tim Peters7a29bd52001-09-12 03:03:31 +00007053 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 Py_INCREF(self);
7055 return (PyObject*) self;
7056 }
7057
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007058 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059}
7060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007061PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062"S.lower() -> unicode\n\
7063\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007064Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065
7066static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007067unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 return fixup(self, fixlower);
7070}
7071
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for a strip mode: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
7080
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081/* externally visible for str.strip(unicode) */
7082PyObject *
7083_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7084{
7085 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007086 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007088 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7089 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007091 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7092
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093 i = 0;
7094 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007095 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7096 i++;
7097 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098 }
7099
7100 j = len;
7101 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007102 do {
7103 j--;
7104 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7105 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106 }
7107
7108 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007109 Py_INCREF(self);
7110 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111 }
7112 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007113 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114}
7115
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116
7117static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007121 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
7123 i = 0;
7124 if (striptype != RIGHTSTRIP) {
7125 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7126 i++;
7127 }
7128 }
7129
7130 j = len;
7131 if (striptype != LEFTSTRIP) {
7132 do {
7133 j--;
7134 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7135 j++;
7136 }
7137
7138 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7139 Py_INCREF(self);
7140 return (PyObject*)self;
7141 }
7142 else
7143 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144}
7145
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146
7147static PyObject *
7148do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7149{
7150 PyObject *sep = NULL;
7151
7152 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7153 return NULL;
7154
7155 if (sep != NULL && sep != Py_None) {
7156 if (PyUnicode_Check(sep))
7157 return _PyUnicode_XStrip(self, striptype, sep);
7158 else if (PyString_Check(sep)) {
7159 PyObject *res;
7160 sep = PyUnicode_FromObject(sep);
7161 if (sep==NULL)
7162 return NULL;
7163 res = _PyUnicode_XStrip(self, striptype, sep);
7164 Py_DECREF(sep);
7165 return res;
7166 }
7167 else {
7168 PyErr_Format(PyExc_TypeError,
7169 "%s arg must be None, unicode or str",
7170 STRIPNAME(striptype));
7171 return NULL;
7172 }
7173 }
7174
7175 return do_strip(self, striptype);
7176}
7177
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007181\n\
7182Return a copy of the string S with leading and trailing\n\
7183whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007184If chars is given and not None, remove characters in chars instead.\n\
7185If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007186
7187static PyObject *
7188unicode_strip(PyUnicodeObject *self, PyObject *args)
7189{
7190 if (PyTuple_GET_SIZE(args) == 0)
7191 return do_strip(self, BOTHSTRIP); /* Common case */
7192 else
7193 return do_argstrip(self, BOTHSTRIP, args);
7194}
7195
7196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007197PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007198"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199\n\
7200Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007201If chars is given and not None, remove characters in chars instead.\n\
7202If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007203
7204static PyObject *
7205unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7206{
7207 if (PyTuple_GET_SIZE(args) == 0)
7208 return do_strip(self, LEFTSTRIP); /* Common case */
7209 else
7210 return do_argstrip(self, LEFTSTRIP, args);
7211}
7212
7213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007214PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007215"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216\n\
7217Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007218If chars is given and not None, remove characters in chars instead.\n\
7219If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007220
7221static PyObject *
7222unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7223{
7224 if (PyTuple_GET_SIZE(args) == 0)
7225 return do_strip(self, RIGHTSTRIP); /* Common case */
7226 else
7227 return do_argstrip(self, RIGHTSTRIP, args);
7228}
7229
7230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007232unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233{
7234 PyUnicodeObject *u;
7235 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007236 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007237 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238
7239 if (len < 0)
7240 len = 0;
7241
Tim Peters7a29bd52001-09-12 03:03:31 +00007242 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 /* no repeat, return original string */
7244 Py_INCREF(str);
7245 return (PyObject*) str;
7246 }
Tim Peters8f422462000-09-09 06:13:41 +00007247
7248 /* ensure # of chars needed doesn't overflow int and # of bytes
7249 * needed doesn't overflow size_t
7250 */
7251 nchars = len * str->length;
7252 if (len && nchars / len != str->length) {
7253 PyErr_SetString(PyExc_OverflowError,
7254 "repeated string is too long");
7255 return NULL;
7256 }
7257 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7258 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7259 PyErr_SetString(PyExc_OverflowError,
7260 "repeated string is too long");
7261 return NULL;
7262 }
7263 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 if (!u)
7265 return NULL;
7266
7267 p = u->str;
7268
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007269 if (str->length == 1 && len > 0) {
7270 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007271 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007272 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007273 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007274 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007275 done = str->length;
7276 }
7277 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007278 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007279 Py_UNICODE_COPY(p+done, p, n);
7280 done += n;
7281 }
7282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284 return (PyObject*) u;
7285}
7286
7287PyObject *PyUnicode_Replace(PyObject *obj,
7288 PyObject *subobj,
7289 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291{
7292 PyObject *self;
7293 PyObject *str1;
7294 PyObject *str2;
7295 PyObject *result;
7296
7297 self = PyUnicode_FromObject(obj);
7298 if (self == NULL)
7299 return NULL;
7300 str1 = PyUnicode_FromObject(subobj);
7301 if (str1 == NULL) {
7302 Py_DECREF(self);
7303 return NULL;
7304 }
7305 str2 = PyUnicode_FromObject(replobj);
7306 if (str2 == NULL) {
7307 Py_DECREF(self);
7308 Py_DECREF(str1);
7309 return NULL;
7310 }
Tim Petersced69f82003-09-16 20:30:58 +00007311 result = replace((PyUnicodeObject *)self,
7312 (PyUnicodeObject *)str1,
7313 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 maxcount);
7315 Py_DECREF(self);
7316 Py_DECREF(str1);
7317 Py_DECREF(str2);
7318 return result;
7319}
7320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322"S.replace (old, new[, maxsplit]) -> unicode\n\
7323\n\
7324Return a copy of S with all occurrences of substring\n\
7325old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007326given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
7328static PyObject*
7329unicode_replace(PyUnicodeObject *self, PyObject *args)
7330{
7331 PyUnicodeObject *str1;
7332 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007333 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 PyObject *result;
7335
Martin v. Löwis18e16552006-02-15 17:27:45 +00007336 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 return NULL;
7338 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7339 if (str1 == NULL)
7340 return NULL;
7341 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007342 if (str2 == NULL) {
7343 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346
7347 result = replace(self, str1, str2, maxcount);
7348
7349 Py_DECREF(str1);
7350 Py_DECREF(str2);
7351 return result;
7352}
7353
7354static
7355PyObject *unicode_repr(PyObject *unicode)
7356{
7357 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7358 PyUnicode_GET_SIZE(unicode),
7359 1);
7360}
7361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363"S.rfind(sub [,start [,end]]) -> int\n\
7364\n\
7365Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007366such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367arguments start and end are interpreted as in slice notation.\n\
7368\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007369Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
7371static PyObject *
7372unicode_rfind(PyUnicodeObject *self, PyObject *args)
7373{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007374 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007375 Py_ssize_t start;
7376 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007377 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
Facundo Batista57d56692007-11-16 18:04:14 +00007379 if (!_ParseTupleFinds(args, &substring, &start, &end))
7380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007382 result = stringlib_rfind_slice(
7383 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7384 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7385 start, end
7386 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387
7388 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007389
7390 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394"S.rindex(sub [,start [,end]]) -> int\n\
7395\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
7398static PyObject *
7399unicode_rindex(PyUnicodeObject *self, PyObject *args)
7400{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007401 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007402 Py_ssize_t start;
7403 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007404 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
Facundo Batista57d56692007-11-16 18:04:14 +00007406 if (!_ParseTupleFinds(args, &substring, &start, &end))
7407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007409 result = stringlib_rfind_slice(
7410 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7411 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7412 start, end
7413 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414
7415 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 if (result < 0) {
7418 PyErr_SetString(PyExc_ValueError, "substring not found");
7419 return NULL;
7420 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007425"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426\n\
7427Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007428done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430static PyObject *
7431unicode_rjust(PyUnicodeObject *self, PyObject *args)
7432{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007433 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007434 Py_UNICODE fillchar = ' ';
7435
Martin v. Löwis412fb672006-04-13 06:34:32 +00007436 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 return NULL;
7438
Tim Peters7a29bd52001-09-12 03:03:31 +00007439 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 Py_INCREF(self);
7441 return (PyObject*) self;
7442 }
7443
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007444 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445}
7446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 /* standard clamping */
7451 if (start < 0)
7452 start = 0;
7453 if (end < 0)
7454 end = 0;
7455 if (end > self->length)
7456 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007457 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 /* full slice, return original string */
7459 Py_INCREF(self);
7460 return (PyObject*) self;
7461 }
7462 if (start > end)
7463 start = end;
7464 /* copy slice */
7465 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7466 end - start);
7467}
7468
7469PyObject *PyUnicode_Split(PyObject *s,
7470 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007471 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472{
7473 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 s = PyUnicode_FromObject(s);
7476 if (s == NULL)
7477 return NULL;
7478 if (sep != NULL) {
7479 sep = PyUnicode_FromObject(sep);
7480 if (sep == NULL) {
7481 Py_DECREF(s);
7482 return NULL;
7483 }
7484 }
7485
7486 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7487
7488 Py_DECREF(s);
7489 Py_XDECREF(sep);
7490 return result;
7491}
7492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494"S.split([sep [,maxsplit]]) -> list of strings\n\
7495\n\
7496Return a list of the words in S, using sep as the\n\
7497delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007498splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007499any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
7501static PyObject*
7502unicode_split(PyUnicodeObject *self, PyObject *args)
7503{
7504 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
7509
7510 if (substring == Py_None)
7511 return split(self, NULL, maxcount);
7512 else if (PyUnicode_Check(substring))
7513 return split(self, (PyUnicodeObject *)substring, maxcount);
7514 else
7515 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7516}
7517
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518PyObject *
7519PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7520{
7521 PyObject* str_obj;
7522 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007523 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007524
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007525 str_obj = PyUnicode_FromObject(str_in);
7526 if (!str_obj)
7527 return NULL;
7528 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007529 if (!sep_obj) {
7530 Py_DECREF(str_obj);
7531 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007532 }
7533
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007534 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007535 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7536 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7537 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007538
Fredrik Lundhb9479482006-05-26 17:22:38 +00007539 Py_DECREF(sep_obj);
7540 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007541
7542 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007543}
7544
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007545
7546PyObject *
7547PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7548{
7549 PyObject* str_obj;
7550 PyObject* sep_obj;
7551 PyObject* out;
7552
7553 str_obj = PyUnicode_FromObject(str_in);
7554 if (!str_obj)
7555 return NULL;
7556 sep_obj = PyUnicode_FromObject(sep_in);
7557 if (!sep_obj) {
7558 Py_DECREF(str_obj);
7559 return NULL;
7560 }
7561
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007562 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007563 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7564 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7565 );
7566
7567 Py_DECREF(sep_obj);
7568 Py_DECREF(str_obj);
7569
7570 return out;
7571}
7572
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007573PyDoc_STRVAR(partition__doc__,
7574"S.partition(sep) -> (head, sep, tail)\n\
7575\n\
7576Searches for the separator sep in S, and returns the part before it,\n\
7577the separator itself, and the part after it. If the separator is not\n\
7578found, returns S and two empty strings.");
7579
7580static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007581unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007582{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007583 return PyUnicode_Partition((PyObject *)self, separator);
7584}
7585
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007586PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007587"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007588\n\
7589Searches for the separator sep in S, starting at the end of S, and returns\n\
7590the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007591separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007592
7593static PyObject*
7594unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7595{
7596 return PyUnicode_RPartition((PyObject *)self, separator);
7597}
7598
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007599PyObject *PyUnicode_RSplit(PyObject *s,
7600 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007601 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007602{
7603 PyObject *result;
7604
7605 s = PyUnicode_FromObject(s);
7606 if (s == NULL)
7607 return NULL;
7608 if (sep != NULL) {
7609 sep = PyUnicode_FromObject(sep);
7610 if (sep == NULL) {
7611 Py_DECREF(s);
7612 return NULL;
7613 }
7614 }
7615
7616 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7617
7618 Py_DECREF(s);
7619 Py_XDECREF(sep);
7620 return result;
7621}
7622
7623PyDoc_STRVAR(rsplit__doc__,
7624"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7625\n\
7626Return a list of the words in S, using sep as the\n\
7627delimiter string, starting at the end of the string and\n\
7628working to the front. If maxsplit is given, at most maxsplit\n\
7629splits are done. If sep is not specified, any whitespace string\n\
7630is a separator.");
7631
7632static PyObject*
7633unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7634{
7635 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007639 return NULL;
7640
7641 if (substring == Py_None)
7642 return rsplit(self, NULL, maxcount);
7643 else if (PyUnicode_Check(substring))
7644 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7645 else
7646 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007650"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651\n\
7652Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007653Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject*
7657unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7658{
Guido van Rossum86662912000-04-11 15:38:46 +00007659 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Guido van Rossum86662912000-04-11 15:38:46 +00007661 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 return NULL;
7663
Guido van Rossum86662912000-04-11 15:38:46 +00007664 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665}
7666
7667static
7668PyObject *unicode_str(PyUnicodeObject *self)
7669{
Fred Drakee4315f52000-05-09 19:53:39 +00007670 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674"S.swapcase() -> unicode\n\
7675\n\
7676Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
7679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007680unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 return fixup(self, fixswapcase);
7683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686"S.translate(table) -> unicode\n\
7687\n\
7688Return a copy of the string S, where all characters have been mapped\n\
7689through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007690Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7691Unmapped characters are left untouched. Characters mapped to None\n\
7692are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007695unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Tim Petersced69f82003-09-16 20:30:58 +00007697 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007699 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 "ignore");
7701}
7702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007703PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704"S.upper() -> unicode\n\
7705\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
7708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007709unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 return fixup(self, fixupper);
7712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715"S.zfill(width) -> unicode\n\
7716\n\
7717Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject *
7721unicode_zfill(PyUnicodeObject *self, PyObject *args)
7722{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 PyUnicodeObject *u;
7725
Martin v. Löwis18e16552006-02-15 17:27:45 +00007726 Py_ssize_t width;
7727 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 return NULL;
7729
7730 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007731 if (PyUnicode_CheckExact(self)) {
7732 Py_INCREF(self);
7733 return (PyObject*) self;
7734 }
7735 else
7736 return PyUnicode_FromUnicode(
7737 PyUnicode_AS_UNICODE(self),
7738 PyUnicode_GET_SIZE(self)
7739 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
7741
7742 fill = width - self->length;
7743
7744 u = pad(self, fill, 0, '0');
7745
Walter Dörwald068325e2002-04-15 13:36:47 +00007746 if (u == NULL)
7747 return NULL;
7748
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 if (u->str[fill] == '+' || u->str[fill] == '-') {
7750 /* move sign to beginning of string */
7751 u->str[0] = u->str[fill];
7752 u->str[fill] = '0';
7753 }
7754
7755 return (PyObject*) u;
7756}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
#if 0
/* Compiled out: returns the current value of `numfree` as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);

    return count;
}
#endif
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007767"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007769Return True if S starts with the specified prefix, False otherwise.\n\
7770With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007771With optional end, stop comparing S at that position.\n\
7772prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
7774static PyObject *
7775unicode_startswith(PyUnicodeObject *self,
7776 PyObject *args)
7777{
Georg Brandl24250812006-06-09 18:45:48 +00007778 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007780 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007781 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007782 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783
Georg Brandl24250812006-06-09 18:45:48 +00007784 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007785 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007787 if (PyTuple_Check(subobj)) {
7788 Py_ssize_t i;
7789 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7790 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7791 PyTuple_GET_ITEM(subobj, i));
7792 if (substring == NULL)
7793 return NULL;
7794 result = tailmatch(self, substring, start, end, -1);
7795 Py_DECREF(substring);
7796 if (result) {
7797 Py_RETURN_TRUE;
7798 }
7799 }
7800 /* nothing matched */
7801 Py_RETURN_FALSE;
7802 }
7803 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007805 return NULL;
7806 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007808 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809}
7810
7811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007812PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007813"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007815Return True if S ends with the specified suffix, False otherwise.\n\
7816With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007817With optional end, stop comparing S at that position.\n\
7818suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
7820static PyObject *
7821unicode_endswith(PyUnicodeObject *self,
7822 PyObject *args)
7823{
Georg Brandl24250812006-06-09 18:45:48 +00007824 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007826 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007827 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007828 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829
Georg Brandl24250812006-06-09 18:45:48 +00007830 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7831 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007833 if (PyTuple_Check(subobj)) {
7834 Py_ssize_t i;
7835 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7836 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7837 PyTuple_GET_ITEM(subobj, i));
7838 if (substring == NULL)
7839 return NULL;
7840 result = tailmatch(self, substring, start, end, +1);
7841 Py_DECREF(substring);
7842 if (result) {
7843 Py_RETURN_TRUE;
7844 }
7845 }
7846 Py_RETURN_FALSE;
7847 }
7848 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851
Georg Brandl24250812006-06-09 18:45:48 +00007852 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007854 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855}
7856
7857
Eric Smitha9f7d622008-02-17 19:46:49 +00007858/* Implements do_string_format, which is unicode because of stringlib */
7859#include "stringlib/string_format.h"
7860
7861PyDoc_STRVAR(format__doc__,
7862"S.format(*args, **kwargs) -> unicode\n\
7863\n\
7864");
7865
7866PyDoc_STRVAR(p_format__doc__,
7867"S.__format__(format_spec) -> unicode\n\
7868\n\
7869");
7870
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007871
7872static PyObject *
7873unicode_getnewargs(PyUnicodeObject *v)
7874{
7875 return Py_BuildValue("(u#)", v->str, v->length);
7876}
7877
7878
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879static PyMethodDef unicode_methods[] = {
7880
7881 /* Order is according to common usage: often used methods should
7882 appear first, since lookup is done sequentially. */
7883
Georg Brandlecdc0a92006-03-30 12:19:07 +00007884 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007885 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7886 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007887 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007888 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7889 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7890 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7891 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7892 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7893 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7894 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007895 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007896 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7897 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7898 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007899 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007900 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007901/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7902 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7903 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7904 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007906 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007907 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007908 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7910 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7911 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7912 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7913 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7914 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7915 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7916 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7917 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7918 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7919 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7920 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7921 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7922 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007923 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007924 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7925 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7926 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7927 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007928#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007929 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930#endif
7931
7932#if 0
7933 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007934 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935#endif
7936
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007937 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 {NULL, NULL}
7939};
7940
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007941static PyObject *
7942unicode_mod(PyObject *v, PyObject *w)
7943{
7944 if (!PyUnicode_Check(v)) {
7945 Py_INCREF(Py_NotImplemented);
7946 return Py_NotImplemented;
7947 }
7948 return PyUnicode_Format(v, w);
7949}
7950
7951static PyNumberMethods unicode_as_number = {
7952 0, /*nb_add*/
7953 0, /*nb_subtract*/
7954 0, /*nb_multiply*/
7955 0, /*nb_divide*/
7956 unicode_mod, /*nb_remainder*/
7957};
7958
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007960 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007961 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007962 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7963 (ssizeargfunc) unicode_getitem, /* sq_item */
7964 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 0, /* sq_ass_item */
7966 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007967 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968};
7969
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007970static PyObject*
7971unicode_subscript(PyUnicodeObject* self, PyObject* item)
7972{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007973 if (PyIndex_Check(item)) {
7974 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007975 if (i == -1 && PyErr_Occurred())
7976 return NULL;
7977 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007978 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007979 return unicode_getitem(self, i);
7980 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007982 Py_UNICODE* source_buf;
7983 Py_UNICODE* result_buf;
7984 PyObject* result;
7985
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007986 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007987 &start, &stop, &step, &slicelength) < 0) {
7988 return NULL;
7989 }
7990
7991 if (slicelength <= 0) {
7992 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007993 } else if (start == 0 && step == 1 && slicelength == self->length &&
7994 PyUnicode_CheckExact(self)) {
7995 Py_INCREF(self);
7996 return (PyObject *)self;
7997 } else if (step == 1) {
7998 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 } else {
8000 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008001 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8002 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008003
8004 if (result_buf == NULL)
8005 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008006
8007 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8008 result_buf[i] = source_buf[cur];
8009 }
Tim Petersced69f82003-09-16 20:30:58 +00008010
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008011 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008012 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008013 return result;
8014 }
8015 } else {
8016 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8017 return NULL;
8018 }
8019}
8020
8021static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008022 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008023 (binaryfunc)unicode_subscript, /* mp_subscript */
8024 (objobjargproc)0, /* mp_ass_subscript */
8025};
8026
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 const void **ptr)
8031{
8032 if (index != 0) {
8033 PyErr_SetString(PyExc_SystemError,
8034 "accessing non-existent unicode segment");
8035 return -1;
8036 }
8037 *ptr = (void *) self->str;
8038 return PyUnicode_GET_DATA_SIZE(self);
8039}
8040
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041static Py_ssize_t
8042unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 const void **ptr)
8044{
8045 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008046 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 return -1;
8048}
8049
8050static int
8051unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008052 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
8054 if (lenp)
8055 *lenp = PyUnicode_GET_DATA_SIZE(self);
8056 return 1;
8057}
8058
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008059static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008061 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 const void **ptr)
8063{
8064 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008065
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 if (index != 0) {
8067 PyErr_SetString(PyExc_SystemError,
8068 "accessing non-existent unicode segment");
8069 return -1;
8070 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008071 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 if (str == NULL)
8073 return -1;
8074 *ptr = (void *) PyString_AS_STRING(str);
8075 return PyString_GET_SIZE(str);
8076}
8077
8078/* Helpers for PyUnicode_Format() */
8079
8080static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008083 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 if (argidx < arglen) {
8085 (*p_argidx)++;
8086 if (arglen < 0)
8087 return args;
8088 else
8089 return PyTuple_GetItem(args, argidx);
8090 }
8091 PyErr_SetString(PyExc_TypeError,
8092 "not enough arguments for format string");
8093 return NULL;
8094}
8095
/* Bit flags recording which conversion-flag characters were seen in a
   format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8101
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008103strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 register Py_ssize_t i;
8106 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 for (i = len - 1; i >= 0; i--)
8108 buffer[i] = (Py_UNICODE) charbuffer[i];
8109
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return len;
8111}
8112
Neal Norwitzfc76d632006-01-10 06:03:13 +00008113static int
8114doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8115{
Tim Peters15231542006-02-16 01:08:01 +00008116 Py_ssize_t result;
8117
Neal Norwitzfc76d632006-01-10 06:03:13 +00008118 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008119 result = strtounicode(buffer, (char *)buffer);
8120 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008121}
8122
8123static int
8124longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8125{
Tim Peters15231542006-02-16 01:08:01 +00008126 Py_ssize_t result;
8127
Neal Norwitzfc76d632006-01-10 06:03:13 +00008128 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008129 result = strtounicode(buffer, (char *)buffer);
8130 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008131}
8132
Guido van Rossum078151d2002-08-11 04:24:12 +00008133/* XXX To save some code duplication, formatfloat/long/int could have been
8134 shared with stringobject.c, converting from 8-bit to Unicode after the
8135 formatting is done. */
8136
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137static int
8138formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008139 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 int flags,
8141 int prec,
8142 int type,
8143 PyObject *v)
8144{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008145 /* fmt = '%#.' + `prec` + `type`
8146 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 char fmt[20];
8148 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 x = PyFloat_AsDouble(v);
8151 if (x == -1.0 && PyErr_Occurred())
8152 return -1;
8153 if (prec < 0)
8154 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8156 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008157 /* Worst case length calc to ensure no buffer overrun:
8158
8159 'g' formats:
8160 fmt = %#.<prec>g
8161 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8162 for any double rep.)
8163 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8164
8165 'f' formats:
8166 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8167 len = 1 + 50 + 1 + prec = 52 + prec
8168
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008169 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008170 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008171
8172 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008173 if (((type == 'g' || type == 'G') &&
8174 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008175 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008176 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008177 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008178 return -1;
8179 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008180 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8181 (flags&F_ALT) ? "#" : "",
8182 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008183 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184}
8185
Tim Peters38fd5b62000-09-21 05:43:11 +00008186static PyObject*
8187formatlong(PyObject *val, int flags, int prec, int type)
8188{
8189 char *buf;
8190 int i, len;
8191 PyObject *str; /* temporary string object. */
8192 PyUnicodeObject *result;
8193
8194 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8195 if (!str)
8196 return NULL;
8197 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008198 if (!result) {
8199 Py_DECREF(str);
8200 return NULL;
8201 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008202 for (i = 0; i < len; i++)
8203 result->str[i] = buf[i];
8204 result->str[len] = 0;
8205 Py_DECREF(str);
8206 return (PyObject*)result;
8207}
8208
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209static int
8210formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008211 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 int flags,
8213 int prec,
8214 int type,
8215 PyObject *v)
8216{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008217 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008218 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8219 * + 1 + 1
8220 * = 24
8221 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008222 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008223 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 long x;
8225
8226 x = PyInt_AsLong(v);
8227 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008228 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008229 if (x < 0 && type == 'u') {
8230 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008231 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008232 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8233 sign = "-";
8234 else
8235 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008237 prec = 1;
8238
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008239 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8240 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008241 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008242 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008243 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008244 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 return -1;
8246 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008247
8248 if ((flags & F_ALT) &&
8249 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008250 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 * of issues that cause pain:
8252 * - when 0 is being converted, the C standard leaves off
8253 * the '0x' or '0X', which is inconsistent with other
8254 * %#x/%#X conversions and inconsistent with Python's
8255 * hex() function
8256 * - there are platforms that violate the standard and
8257 * convert 0 with the '0x' or '0X'
8258 * (Metrowerks, Compaq Tru64)
8259 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008260 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008262 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008263 * We can achieve the desired consistency by inserting our
8264 * own '0x' or '0X' prefix, and substituting %x/%X in place
8265 * of %#x/%#X.
8266 *
8267 * Note that this is the same approach as used in
8268 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008269 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008270 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8271 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008272 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008273 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008274 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8275 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008276 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008277 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008278 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008279 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008280 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008281 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282}
8283
8284static int
8285formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008286 size_t buflen,
8287 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008289 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008290 if (PyUnicode_Check(v)) {
8291 if (PyUnicode_GET_SIZE(v) != 1)
8292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008296 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008297 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008298 goto onError;
8299 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302 else {
8303 /* Integer input truncated to a character */
8304 long x;
8305 x = PyInt_AsLong(v);
8306 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008307 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008308#ifdef Py_UNICODE_WIDE
8309 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008310 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008311 "%c arg not in range(0x110000) "
8312 "(wide Python build)");
8313 return -1;
8314 }
8315#else
8316 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008317 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008318 "%c arg not in range(0x10000) "
8319 "(narrow Python build)");
8320 return -1;
8321 }
8322#endif
8323 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
8325 buf[1] = '\0';
8326 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008327
8328 onError:
8329 PyErr_SetString(PyExc_TypeError,
8330 "%c requires int or char");
8331 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
8343
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344PyObject *PyUnicode_Format(PyObject *format,
8345 PyObject *args)
8346{
8347 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008348 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 int args_owned = 0;
8350 PyUnicodeObject *result = NULL;
8351 PyObject *dict = NULL;
8352 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008353
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 if (format == NULL || args == NULL) {
8355 PyErr_BadInternalCall();
8356 return NULL;
8357 }
8358 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008359 if (uformat == NULL)
8360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 fmt = PyUnicode_AS_UNICODE(uformat);
8362 fmtcnt = PyUnicode_GET_SIZE(uformat);
8363
8364 reslen = rescnt = fmtcnt + 100;
8365 result = _PyUnicode_New(reslen);
8366 if (result == NULL)
8367 goto onError;
8368 res = PyUnicode_AS_UNICODE(result);
8369
8370 if (PyTuple_Check(args)) {
8371 arglen = PyTuple_Size(args);
8372 argidx = 0;
8373 }
8374 else {
8375 arglen = -1;
8376 argidx = -2;
8377 }
Christian Heimese93237d2007-12-19 02:37:44 +00008378 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008379 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 dict = args;
8381
8382 while (--fmtcnt >= 0) {
8383 if (*fmt != '%') {
8384 if (--rescnt < 0) {
8385 rescnt = fmtcnt + 100;
8386 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008387 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390 --rescnt;
8391 }
8392 *res++ = *fmt++;
8393 }
8394 else {
8395 /* Got a format specifier */
8396 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 Py_UNICODE c = '\0';
8400 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008401 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 PyObject *v = NULL;
8403 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008404 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008406 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008407 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408
8409 fmt++;
8410 if (*fmt == '(') {
8411 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008412 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 PyObject *key;
8414 int pcount = 1;
8415
8416 if (dict == NULL) {
8417 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008418 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 goto onError;
8420 }
8421 ++fmt;
8422 --fmtcnt;
8423 keystart = fmt;
8424 /* Skip over balanced parentheses */
8425 while (pcount > 0 && --fmtcnt >= 0) {
8426 if (*fmt == ')')
8427 --pcount;
8428 else if (*fmt == '(')
8429 ++pcount;
8430 fmt++;
8431 }
8432 keylen = fmt - keystart - 1;
8433 if (fmtcnt < 0 || pcount > 0) {
8434 PyErr_SetString(PyExc_ValueError,
8435 "incomplete format key");
8436 goto onError;
8437 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008438#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008439 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 then looked up since Python uses strings to hold
8441 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008442 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 key = PyUnicode_EncodeUTF8(keystart,
8444 keylen,
8445 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008446#else
8447 key = PyUnicode_FromUnicode(keystart, keylen);
8448#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 if (key == NULL)
8450 goto onError;
8451 if (args_owned) {
8452 Py_DECREF(args);
8453 args_owned = 0;
8454 }
8455 args = PyObject_GetItem(dict, key);
8456 Py_DECREF(key);
8457 if (args == NULL) {
8458 goto onError;
8459 }
8460 args_owned = 1;
8461 arglen = -1;
8462 argidx = -2;
8463 }
8464 while (--fmtcnt >= 0) {
8465 switch (c = *fmt++) {
8466 case '-': flags |= F_LJUST; continue;
8467 case '+': flags |= F_SIGN; continue;
8468 case ' ': flags |= F_BLANK; continue;
8469 case '#': flags |= F_ALT; continue;
8470 case '0': flags |= F_ZERO; continue;
8471 }
8472 break;
8473 }
8474 if (c == '*') {
8475 v = getnextarg(args, arglen, &argidx);
8476 if (v == NULL)
8477 goto onError;
8478 if (!PyInt_Check(v)) {
8479 PyErr_SetString(PyExc_TypeError,
8480 "* wants int");
8481 goto onError;
8482 }
8483 width = PyInt_AsLong(v);
8484 if (width < 0) {
8485 flags |= F_LJUST;
8486 width = -width;
8487 }
8488 if (--fmtcnt >= 0)
8489 c = *fmt++;
8490 }
8491 else if (c >= '0' && c <= '9') {
8492 width = c - '0';
8493 while (--fmtcnt >= 0) {
8494 c = *fmt++;
8495 if (c < '0' || c > '9')
8496 break;
8497 if ((width*10) / 10 != width) {
8498 PyErr_SetString(PyExc_ValueError,
8499 "width too big");
8500 goto onError;
8501 }
8502 width = width*10 + (c - '0');
8503 }
8504 }
8505 if (c == '.') {
8506 prec = 0;
8507 if (--fmtcnt >= 0)
8508 c = *fmt++;
8509 if (c == '*') {
8510 v = getnextarg(args, arglen, &argidx);
8511 if (v == NULL)
8512 goto onError;
8513 if (!PyInt_Check(v)) {
8514 PyErr_SetString(PyExc_TypeError,
8515 "* wants int");
8516 goto onError;
8517 }
8518 prec = PyInt_AsLong(v);
8519 if (prec < 0)
8520 prec = 0;
8521 if (--fmtcnt >= 0)
8522 c = *fmt++;
8523 }
8524 else if (c >= '0' && c <= '9') {
8525 prec = c - '0';
8526 while (--fmtcnt >= 0) {
8527 c = Py_CHARMASK(*fmt++);
8528 if (c < '0' || c > '9')
8529 break;
8530 if ((prec*10) / 10 != prec) {
8531 PyErr_SetString(PyExc_ValueError,
8532 "prec too big");
8533 goto onError;
8534 }
8535 prec = prec*10 + (c - '0');
8536 }
8537 }
8538 } /* prec */
8539 if (fmtcnt >= 0) {
8540 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 if (--fmtcnt >= 0)
8542 c = *fmt++;
8543 }
8544 }
8545 if (fmtcnt < 0) {
8546 PyErr_SetString(PyExc_ValueError,
8547 "incomplete format");
8548 goto onError;
8549 }
8550 if (c != '%') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 }
8555 sign = 0;
8556 fill = ' ';
8557 switch (c) {
8558
8559 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008560 pbuf = formatbuf;
8561 /* presume that buffer length is at least 1 */
8562 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 len = 1;
8564 break;
8565
8566 case 's':
8567 case 'r':
8568 if (PyUnicode_Check(v) && c == 's') {
8569 temp = v;
8570 Py_INCREF(temp);
8571 }
8572 else {
8573 PyObject *unicode;
8574 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008575 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 else
8577 temp = PyObject_Repr(v);
8578 if (temp == NULL)
8579 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008580 if (PyUnicode_Check(temp))
8581 /* nothing to do */;
8582 else if (PyString_Check(temp)) {
8583 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008584 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008586 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008588 Py_DECREF(temp);
8589 temp = unicode;
8590 if (temp == NULL)
8591 goto onError;
8592 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008593 else {
8594 Py_DECREF(temp);
8595 PyErr_SetString(PyExc_TypeError,
8596 "%s argument has non-string str()");
8597 goto onError;
8598 }
8599 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008600 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 len = PyUnicode_GET_SIZE(temp);
8602 if (prec >= 0 && len > prec)
8603 len = prec;
8604 break;
8605
8606 case 'i':
8607 case 'd':
8608 case 'u':
8609 case 'o':
8610 case 'x':
8611 case 'X':
8612 if (c == 'i')
8613 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008614 isnumok = 0;
8615 if (PyNumber_Check(v)) {
8616 PyObject *iobj=NULL;
8617
8618 if (PyInt_Check(v) || (PyLong_Check(v))) {
8619 iobj = v;
8620 Py_INCREF(iobj);
8621 }
8622 else {
8623 iobj = PyNumber_Int(v);
8624 if (iobj==NULL) iobj = PyNumber_Long(v);
8625 }
8626 if (iobj!=NULL) {
8627 if (PyInt_Check(iobj)) {
8628 isnumok = 1;
8629 pbuf = formatbuf;
8630 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8631 flags, prec, c, iobj);
8632 Py_DECREF(iobj);
8633 if (len < 0)
8634 goto onError;
8635 sign = 1;
8636 }
8637 else if (PyLong_Check(iobj)) {
8638 isnumok = 1;
8639 temp = formatlong(iobj, flags, prec, c);
8640 Py_DECREF(iobj);
8641 if (!temp)
8642 goto onError;
8643 pbuf = PyUnicode_AS_UNICODE(temp);
8644 len = PyUnicode_GET_SIZE(temp);
8645 sign = 1;
8646 }
8647 else {
8648 Py_DECREF(iobj);
8649 }
8650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008652 if (!isnumok) {
8653 PyErr_Format(PyExc_TypeError,
8654 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008655 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008656 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008657 }
8658 if (flags & F_ZERO)
8659 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 break;
8661
8662 case 'e':
8663 case 'E':
8664 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008665 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 case 'g':
8667 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008668 if (c == 'F')
8669 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008670 pbuf = formatbuf;
8671 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8672 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 if (len < 0)
8674 goto onError;
8675 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008676 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 fill = '0';
8678 break;
8679
8680 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008681 pbuf = formatbuf;
8682 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 if (len < 0)
8684 goto onError;
8685 break;
8686
8687 default:
8688 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008689 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008690 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008691 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008692 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008693 (Py_ssize_t)(fmt - 1 -
8694 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 goto onError;
8696 }
8697 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008698 if (*pbuf == '-' || *pbuf == '+') {
8699 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 len--;
8701 }
8702 else if (flags & F_SIGN)
8703 sign = '+';
8704 else if (flags & F_BLANK)
8705 sign = ' ';
8706 else
8707 sign = 0;
8708 }
8709 if (width < len)
8710 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008711 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 reslen -= rescnt;
8713 rescnt = width + fmtcnt + 100;
8714 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008715 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008716 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008717 PyErr_NoMemory();
8718 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008719 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008720 if (_PyUnicode_Resize(&result, reslen) < 0) {
8721 Py_XDECREF(temp);
8722 goto onError;
8723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 res = PyUnicode_AS_UNICODE(result)
8725 + reslen - rescnt;
8726 }
8727 if (sign) {
8728 if (fill != ' ')
8729 *res++ = sign;
8730 rescnt--;
8731 if (width > len)
8732 width--;
8733 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008734 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8735 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008736 assert(pbuf[1] == c);
8737 if (fill != ' ') {
8738 *res++ = *pbuf++;
8739 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008740 }
Tim Petersfff53252001-04-12 18:38:48 +00008741 rescnt -= 2;
8742 width -= 2;
8743 if (width < 0)
8744 width = 0;
8745 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 if (width > len && !(flags & F_LJUST)) {
8748 do {
8749 --rescnt;
8750 *res++ = fill;
8751 } while (--width > len);
8752 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008753 if (fill == ' ') {
8754 if (sign)
8755 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008756 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008757 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008758 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008759 *res++ = *pbuf++;
8760 *res++ = *pbuf++;
8761 }
8762 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008763 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 res += len;
8765 rescnt -= len;
8766 while (--width >= len) {
8767 --rescnt;
8768 *res++ = ' ';
8769 }
8770 if (dict && (argidx < arglen) && c != '%') {
8771 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008772 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008773 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 goto onError;
8775 }
8776 Py_XDECREF(temp);
8777 } /* '%' */
8778 } /* until end */
8779 if (argidx < arglen && !dict) {
8780 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008781 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 goto onError;
8783 }
8784
Thomas Woutersa96affe2006-03-12 00:29:36 +00008785 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (args_owned) {
8788 Py_DECREF(args);
8789 }
8790 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 return (PyObject *)result;
8792
8793 onError:
8794 Py_XDECREF(result);
8795 Py_DECREF(uformat);
8796 if (args_owned) {
8797 Py_DECREF(args);
8798 }
8799 return NULL;
8800}
8801
8802static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008803 (readbufferproc) unicode_buffer_getreadbuf,
8804 (writebufferproc) unicode_buffer_getwritebuf,
8805 (segcountproc) unicode_buffer_getsegcount,
8806 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807};
8808
Jeremy Hylton938ace62002-07-17 16:30:39 +00008809static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8811
Tim Peters6d6c1a32001-08-02 04:15:00 +00008812static PyObject *
8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8814{
8815 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008816 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008817 char *encoding = NULL;
8818 char *errors = NULL;
8819
Guido van Rossume023fe02001-08-30 03:12:59 +00008820 if (type != &PyUnicode_Type)
8821 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8823 kwlist, &x, &encoding, &errors))
8824 return NULL;
8825 if (x == NULL)
8826 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008827 if (encoding == NULL && errors == NULL)
8828 return PyObject_Unicode(x);
8829 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008830 return PyUnicode_FromEncodedObject(x, encoding, errors);
8831}
8832
Guido van Rossume023fe02001-08-30 03:12:59 +00008833static PyObject *
8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8835{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008836 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008838
8839 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8840 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8841 if (tmp == NULL)
8842 return NULL;
8843 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008844 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008845 if (pnew == NULL) {
8846 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008847 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008848 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008849 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008850 if (pnew->str == NULL) {
8851 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008852 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008853 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008854 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008855 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008856 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8857 pnew->length = n;
8858 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008859 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008860 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008864"unicode(string [, encoding[, errors]]) -> object\n\
8865\n\
8866Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008867encoding defaults to the current default string encoding.\n\
8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008871 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 "unicode", /* tp_name */
8873 sizeof(PyUnicodeObject), /* tp_size */
8874 0, /* tp_itemsize */
8875 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008876 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008878 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008880 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008881 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008882 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008884 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 (hashfunc) unicode_hash, /* tp_hash*/
8886 0, /* tp_call*/
8887 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888 PyObject_GenericGetAttr, /* tp_getattro */
8889 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008892 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008893 unicode_doc, /* tp_doc */
8894 0, /* tp_traverse */
8895 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008896 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008897 0, /* tp_weaklistoffset */
8898 0, /* tp_iter */
8899 0, /* tp_iternext */
8900 unicode_methods, /* tp_methods */
8901 0, /* tp_members */
8902 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008903 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008904 0, /* tp_dict */
8905 0, /* tp_descr_get */
8906 0, /* tp_descr_set */
8907 0, /* tp_dictoffset */
8908 0, /* tp_init */
8909 0, /* tp_alloc */
8910 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008911 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912};
8913
8914/* Initialize the Unicode implementation */
8915
Thomas Wouters78890102000-07-22 19:25:51 +00008916void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008918 int i;
8919
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008920 /* XXX - move this array to unicodectype.c ? */
8921 Py_UNICODE linebreak[] = {
8922 0x000A, /* LINE FEED */
8923 0x000D, /* CARRIAGE RETURN */
8924 0x001C, /* FILE SEPARATOR */
8925 0x001D, /* GROUP SEPARATOR */
8926 0x001E, /* RECORD SEPARATOR */
8927 0x0085, /* NEXT LINE */
8928 0x2028, /* LINE SEPARATOR */
8929 0x2029, /* PARAGRAPH SEPARATOR */
8930 };
8931
Fred Drakee4315f52000-05-09 19:53:39 +00008932 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008933 free_list = NULL;
8934 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008936 if (!unicode_empty)
8937 return;
8938
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008939 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008940 for (i = 0; i < 256; i++)
8941 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008942 if (PyType_Ready(&PyUnicode_Type) < 0)
8943 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008944
8945 /* initialize the linebreak bloom filter */
8946 bloom_linebreak = make_bloom_mask(
8947 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8948 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008949
8950 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951}
8952
8953/* Finalize the Unicode implementation */
8954
Christian Heimes3b718a72008-02-14 12:47:33 +00008955int
8956PyUnicode_ClearFreeList(void)
8957{
8958 int freelist_size = numfree;
8959 PyUnicodeObject *u;
8960
8961 for (u = free_list; u != NULL;) {
8962 PyUnicodeObject *v = u;
8963 u = *(PyUnicodeObject **)u;
8964 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00008965 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00008966 Py_XDECREF(v->defenc);
8967 PyObject_Del(v);
8968 numfree--;
8969 }
8970 free_list = NULL;
8971 assert(numfree == 0);
8972 return freelist_size;
8973}
8974
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975void
Thomas Wouters78890102000-07-22 19:25:51 +00008976_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008978 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008980 Py_XDECREF(unicode_empty);
8981 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008982
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008983 for (i = 0; i < 256; i++) {
8984 if (unicode_latin1[i]) {
8985 Py_DECREF(unicode_latin1[i]);
8986 unicode_latin1[i] = NULL;
8987 }
8988 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008989 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008991
Anthony Baxterac6bd462006-04-13 02:06:09 +00008992#ifdef __cplusplus
8993}
8994#endif
8995
8996
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008997/*
8998Local variables:
8999c-basic-offset: 4
9000indent-tabs-mode: nil
9001End:
9002*/